diff --git a/.env.example b/.env.example index b662720..3215f98 100644 --- a/.env.example +++ b/.env.example @@ -2,6 +2,14 @@ APP_ENV=development NEXT_PUBLIC_APP_URL=http://localhost:3000 +# TASK-8 Playwright +TASK8_CRAWL_TIMEOUT_MS=60000 +TASK8_CRAWL_MAX_PAGES=20 +TASK8_BROWSER_ASSET_URL= +# Legacy aliases (optional fallback, prefer TASK8_BROWSER_ASSET_URL): +# TASK8_CHROMIUM_EXECUTABLE_URL= +# TASK8_CHROMIUM_EXECUTABLE= + # Convex NEXT_PUBLIC_CONVEX_URL= CONVEX_DEPLOYMENT= diff --git a/README.md b/README.md index 0aecf57..86bb6d7 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,7 @@ Copy `.env.example` to `.env.local` for local development. Keep real secrets out - **SMTP / Stalwart:** `SMTP_HOST`, `SMTP_PORT`, `SMTP_USER`, `SMTP_PASSWORD`, `SMTP_FROM` - **Rybbit:** `RYBBIT_API_URL`, `RYBBIT_API_KEY`, `NEXT_PUBLIC_RYBBIT_SITE_ID` - **Auth:** `BETTER_AUTH_SECRET` +- **TASK-8 enrichment:** `TASK8_BROWSER_ASSET_URL` Only variables prefixed with `NEXT_PUBLIC_` are intended for browser exposure. All API keys, SMTP credentials, and server-only URLs must stay server-side. @@ -48,3 +49,25 @@ Only variables prefixed with `NEXT_PUBLIC_` are intended for browser exposure. A ## Deployment Notes Coolify should run `pnpm install`, `pnpm build`, and `pnpm start`. The current font setup uses `next/font/google`, so production builds need outbound access to Google Fonts unless fonts are later self-hosted. + +TASK-8 enrichment uses `playwright-core` with `@sparticuz/chromium-min` in Convex. Local `npx playwright install` is a browser-testing helper only and does not affect the Convex runtime bundle. + +TASK-8 requires a browser binary source URL configured on Convex. The preferred +variable is: + +- `TASK8_BROWSER_ASSET_URL` (for example your self-hosted or CDN Chromium bundle URL if you do not rely on package defaults). 
+ +For backward compatibility, the action also supports: + +- `TASK8_CHROMIUM_EXECUTABLE_URL` +- `TASK8_CHROMIUM_EXECUTABLE` + +If none are set, enrichment deployment/startup will fail with a clear configuration +error so no silent fallback is used. + +If the URL is missing and no default is available in your environment, the enqueue action will throw a clear deploy/configuration error so enrichment does not silently fall back to a missing binary. + +For TASK-8 deployment updates, run Convex restart/deploy after code changes: + +- Local: `pnpm exec convex dev` +- Remote: `pnpm exec convex deploy` diff --git a/backlog/tasks/task-20 - Convert-campaigns-and-leads-to-compact-cards.md b/backlog/tasks/task-20 - Convert-campaigns-and-leads-to-compact-cards.md new file mode 100644 index 0000000..00473ba --- /dev/null +++ b/backlog/tasks/task-20 - Convert-campaigns-and-leads-to-compact-cards.md @@ -0,0 +1,51 @@ +--- +id: TASK-20 +title: Convert campaigns and leads to compact cards +status: Done +assignee: [] +created_date: '2026-06-04 15:01' +updated_date: '2026-06-04 15:10' +labels: [] +dependencies: [] +priority: high +ordinal: 22000 +--- + +## Description + + +Update the dashboard campaign and lead review UI so campaigns render as individual cards and leads render as compact expandable cards while preserving existing Convex behavior. + + +## Acceptance Criteria + +- [x] #1 Campaigns page renders each campaign as its own responsive card instead of a desktop table. +- [x] #2 Leads page renders compact cards showing company/name, contact data, and priority while hiding review fields behind Mehr anzeigen. +- [x] #3 Expanded lead cards preserve all existing review fields and save/block actions. +- [x] #4 UI remains responsive without horizontal table overflow on desktop and mobile. +- [x] #5 Lint and test verification are run and results are documented. + + + + +## Implementation Plan + + +1. 
Add/adjust tests or static checks that fail for table-based Campaigns/Leads layouts before production edits. +2. Convert CampaignsBoard from desktop table plus mobile cards to one responsive card list. +3. Convert LeadsReviewTable from table rows to compact expandable cards. +4. Run lint, tests, and browser/responsive verification. +5. Record verification notes in Backlog; wait for user confirmation before Done. + + +## Implementation Notes + + +Implemented via subagent-driven TDD. Campaigns and Leads converted from table layouts to compact cards. Added static layout regression tests for campaign cards and lead expandable cards. Verification: pnpm lint exits 0 with 2 pre-existing generated Better Auth warnings; pnpm test passes 107/107; pnpm build passes after rerun with network access for Google Fonts. Browser automation could launch only outside sandbox, but authenticated dashboard routes redirected to /login in the fresh Playwright context, so final visual validation should be done in the existing logged-in browser session. + + +## Final Summary + + +Campaigns now render as responsive cards on all breakpoints. Leads now render as compact expandable cards showing company/contact/priority by default and revealing review fields/actions through Mehr anzeigen. Added regression tests for both card layouts. Verified with pnpm lint, pnpm test, and pnpm build; browser automation only reached the login page because of the fresh unauthenticated context, while the user confirmed the authenticated UI manually. 
+ diff --git a/backlog/tasks/task-21 - Replace-oversized-Convex-browser-runtime-dependency.md b/backlog/tasks/task-21 - Replace-oversized-Convex-browser-runtime-dependency.md new file mode 100644 index 0000000..bfff111 --- /dev/null +++ b/backlog/tasks/task-21 - Replace-oversized-Convex-browser-runtime-dependency.md @@ -0,0 +1,45 @@ +--- +id: TASK-21 +title: Replace oversized Convex browser runtime dependency +status: In Progress +assignee: [] +created_date: '2026-06-04 15:30' +updated_date: '2026-06-04 16:41' +labels: [] +dependencies: [] +priority: high +ordinal: 23000 +--- + +## Description + + +Reduce Convex function module size by replacing @sparticuz/chromium with a minimal serverless Chromium strategy for websiteEnrichmentAction while keeping screenshot/crawl functionality. + + +## Acceptance Criteria + +- [x] #1 Action no longer imports @sparticuz/chromium +- [x] #2 Convex external package list reflects the replacement +- [x] #3 Deployment guidance includes required env var and failure mode for missing browser URL + + +## Implementation Plan + + +1. Verify existing oversized browser dependency path in Convex action and env strategy +2. Replace @sparticuz/chromium with chromium-min + runtime executable source env var +3. Validate by TS/typecheck + + +## Implementation Notes + + +Durchgeführt: Dependency-Swap auf @sparticuz/chromium-min und Nutzung von runtime executableSource aus ENV in convex/websiteEnrichmentAction.ts. convex.json ExternalPackages auf Chromium-Min aktualisiert. Konfigurierter Fehlerpfad bei fehlender Chromium-Variable. + +Final verification passed after switching to @sparticuz/chromium-min with TASK8_BROWSER_ASSET_URL as primary runtime browser asset source. Convex codegen dry-run/typecheck now uploads functions successfully; previous ModulesTooLarge error is resolved. + +Follow-up for repeated /tmp/chromium cannot execute binary file: Context7 confirmed chromium-min remote pack usage; local package code reuses existing /tmp/chromium. 
Added marker-based /tmp cache invalidation keyed by TASK8_BROWSER_ASSET_URL so architecture/source changes remove stale /tmp/chromium and /tmp/chromium-pack before executablePath(). Verification passed: pnpm exec tsc -p tsconfig.json; pnpm test (108/108); pnpm lint (existing generated BetterAuth warnings only); pnpm exec convex codegen --dry-run --typecheck enable. + +Follow-up for libnspr4.so runtime error: Context7 and local @sparticuz/chromium-min docs show the remote pack includes al2023.tar.br, but the package only auto-inflates it when AL2023 detection fires. Convex needs those shared libs even though its runtime is not detected as AL2023. Added explicit AL2023 shared-library preparation after executablePath(): inflate CHROMIUM_PACK_PATH/al2023.tar.br and setupLambdaEnvironment(/tmp/al2023/lib) before Playwright launch. Verification passed: pnpm exec tsc -p tsconfig.json; pnpm test (109/109); pnpm lint (existing generated BetterAuth warnings only); pnpm exec convex codegen --dry-run --typecheck enable. + diff --git a/backlog/tasks/task-22 - Add-source-assertions-for-Convex-AL2023-Chromium-lib-setup.md b/backlog/tasks/task-22 - Add-source-assertions-for-Convex-AL2023-Chromium-lib-setup.md new file mode 100644 index 0000000..56c18db --- /dev/null +++ b/backlog/tasks/task-22 - Add-source-assertions-for-Convex-AL2023-Chromium-lib-setup.md @@ -0,0 +1,40 @@ +--- +id: TASK-22 +title: Add source assertions for Convex AL2023 Chromium lib setup +status: In Progress +assignee: [] +created_date: '2026-06-04 16:37' +updated_date: '2026-06-04 16:41' +labels: [] +dependencies: [] +priority: high +ordinal: 24000 +--- + +## Description + + +Add tests that fail until websiteEnrichmentAction explicitly handles AL2023 shared libs for chromium-min packaging in Convex. + + +## Acceptance Criteria + +- [x] #1 Test asserts chromium-min dynamic import exposes inflate/setupLambdaEnvironment or explicit LD_LIBRARY_PATH handling for /tmp/al2023/lib. 
+- [x] #2 Assertion checks that runtime setup runs before Playwright launch and after executablePath resolution. + + +## Implementation Plan + + +1. Add source assertions for AL2023 runtime setup and launch ordering +2. Run focused website-enrichment action test +3. Confirm failing output and report + + +## Implementation Notes + + +Added source-only assertion in tests/website-enrichment-action.test.ts for AL2023 lib setup. Targeted run `pnpm tsc -p tsconfig.test.json && node --test .test-output/tests/website-enrichment-action.test.js` currently fails as expected on current action source (missing setup/LD_LIBRARY_PATH/al2023 archive handling). + +GREEN follow-up completed: runtime action now exposes chromium-min inflate/setupLambdaEnvironment, prepares /tmp/al2023/lib after executablePath resolution and before Playwright launch, and focused/full verification passes. + diff --git a/backlog/tasks/task-23 - Improve-website-email-extraction.md b/backlog/tasks/task-23 - Improve-website-email-extraction.md new file mode 100644 index 0000000..6e1a3e2 --- /dev/null +++ b/backlog/tasks/task-23 - Improve-website-email-extraction.md @@ -0,0 +1,35 @@ +--- +id: TASK-23 +title: Improve website email extraction +status: In Progress +assignee: [] +created_date: '2026-06-04 17:28' +updated_date: '2026-06-04 17:34' +labels: [] +dependencies: [] +priority: high +ordinal: 25000 +--- + +## Description + + +Fix TASK-8 website enrichment so Playwright crawls contact/imprint/footer email patterns that are visible on crawled pages but currently missed by the extractor. 
+ + +## Acceptance Criteria + +- [x] #1 Extract mailto href emails even with query parameters and labels +- [x] #2 Extract common obfuscated German website email patterns such as [at], (at), at, and spaced @/dot forms +- [x] #3 Treat emails found on Kontakt/Impressum pages or footer contact context as business contact candidates without guessing addresses +- [x] #4 Keep TASK-7 rules intact: no generated emails, named emails require explicit business context +- [x] #5 Verify with focused RED/GREEN tests and full suite + + +## Implementation Notes + + +Updated website-crawler extractor to support mailto query stripping/decoding, HTML entity decoding for email separators, obfuscated [at]/(at)/dot/punkt and spaced @/dot forms, and expanded business-context detection for footer/impressum/contact regions. Limited to lib/website-crawler.ts only. + +Implemented via subagents/TDD: added RED tests for mailto query params, obfuscated email forms, footer/impressum usability, no-guessing false-positive guard, and mailto dedupe. Extractor now decodes common HTML entities, strips/decodes mailto query strings, parses [at]/(at)/punkt/dot/spaced forms with guardrails, expands footer/impressum/contact business context, and leaves TASK-7 selection unchanged. Verification passed: pnpm exec tsc -p tsconfig.json; pnpm test (114/114); pnpm lint (existing generated BetterAuth warnings only); pnpm exec convex codegen --dry-run --typecheck enable. 
+ diff --git a/backlog/tasks/task-24 - Improve-crawler-handling-for-Bock-Rechtsanwaelte-edge-cases.md b/backlog/tasks/task-24 - Improve-crawler-handling-for-Bock-Rechtsanwaelte-edge-cases.md new file mode 100644 index 0000000..8847729 --- /dev/null +++ b/backlog/tasks/task-24 - Improve-crawler-handling-for-Bock-Rechtsanwaelte-edge-cases.md @@ -0,0 +1,50 @@ +--- +id: TASK-24 +title: Improve crawler handling for Bock Rechtsanwaelte edge cases +status: In Progress +assignee: [] +created_date: '2026-06-04 18:04' +updated_date: '2026-06-04 18:09' +labels: [] +dependencies: [] +priority: high +ordinal: 26000 +--- + +## Description + + +Investigate the remaining TASK-8 case where bock-rechtsanwaelte.de/impressum contains a visible email but website enrichment misses it, and address the same-domain timeout separately if reproducible. + + +## Acceptance Criteria + +- [x] #1 Reproduce the missing email against the public impressum page or captured HTML +- [x] #2 Add RED tests for the missed email/link pattern +- [x] #3 Keep no-guessing email rules intact +- [ ] #4 Add focused timeout mitigation only if root cause is identified +- [x] #5 Verify focused tests and full suite + + +## Implementation Plan + + +1. Inspect existing website crawler tests +2. Add failing regression tests for Bock Impressum +3. Keep no-context named-email rejection test unchanged +4. Run focused crawler test and confirm RED + + +## Implementation Notes + + +Working on adding focused RED tests for Bock Rechtsanwaelte email extraction failure; limiting changes to tests/website-crawler.test.ts + +Added 2 RED coverage tests in tests/website-crawler.test.ts. Focused run of .test-output/tests/website-crawler.test.js fails on 2 assertions: Bock Impressum candidate business-context false due expected mismatch behavior, and email-labeled mailto contactPerson currently equals the email string. + +Running minimal fix for Bock Impressum email context/labeling in lib/website-crawler.ts. 
Next: implement anchor-indexing fix and email-label guard, then run focused tests. + +Minimal scoped fix applied in lib/website-crawler.ts: mailto business-context now evaluates against raw input using anchor indices, and email-like labels matching normalized email do not become contactPerson. Verified via focused command: pnpm exec tsc -p tsconfig.test.json && node --test .test-output/tests/website-crawler.test.js (19/19 passing). + +Reproduced Bock Impressum against captured public HTML. Extractor found 5 candidates but all were business=false because mailto anchor offsets from original HTML were checked against normalized HTML; TASK-7 therefore returned null. Added RED tests for Bock-like Impressum mailto context and email-label contactPerson behavior. Fixed mailto path to evaluate business context against original input offsets and suppress contactPerson when anchor label is the email itself. Verified captured real HTML now returns usable chemnitz@bock-rechtsanwaelte.de. Full verification passed: pnpm exec tsc -p tsconfig.json; pnpm test (116/116); pnpm lint (existing generated BetterAuth warnings only); pnpm exec convex codegen --dry-run --typecheck enable. Timeout mitigation not changed yet because timeout root cause is not identified. 
+ diff --git a/backlog/tasks/task-8 - Implement-Playwright-website-crawling-and-screenshot-capture.md b/backlog/tasks/task-8 - Implement-Playwright-website-crawling-and-screenshot-capture.md index 6fba9ec..525a750 100644 --- a/backlog/tasks/task-8 - Implement-Playwright-website-crawling-and-screenshot-capture.md +++ b/backlog/tasks/task-8 - Implement-Playwright-website-crawling-and-screenshot-capture.md @@ -1,10 +1,10 @@ --- id: TASK-8 title: Implement Playwright website crawling and screenshot capture -status: To Do +status: In Progress assignee: [] created_date: '2026-06-03 19:13' -updated_date: '2026-06-04 14:08' +updated_date: '2026-06-04 18:09' labels: - mvp - audit @@ -25,32 +25,51 @@ Build the website inspection and contact-enrichment layer using Playwright. For ## Acceptance Criteria -- [ ] #1 Playwright captures desktop and mobile screenshots for the homepage and stores them in Convex File Storage -- [ ] #2 Crawler visits a bounded set of relevant subpages: Kontakt, Impressum, Leistungen/Angebot, Über uns/Team when discoverable -- [ ] #3 Crawler extracts visible text, page title, meta description, headings, links, phone numbers, email candidates, email source URLs, contact-person context, and CTA/contact-form signals -- [ ] #4 Extracted email candidates are classified through the TASK-7 rules: generic business emails are preferred; named emails are accepted only when explicitly published as business contact addresses; no guessed addresses are generated -- [ ] #5 Leads discovered by Google Places with a website are automatically scheduled for contact enrichment before they remain in Kontakt fehlt; found usable email updates the lead contact fields and status while preserving phone and source data -- [ ] #6 Simple technical checks include HTTPS/final URL, missing title/meta description, visible contact path, and obvious broken internal links within the crawl limit -- [ ] #7 Crawler failures produce useful dashboard-visible errors without blocking unrelated 
leads +- [x] #1 Playwright captures desktop and mobile screenshots for the homepage and stores them in Convex File Storage +- [x] #2 Crawler visits a bounded set of relevant subpages: Kontakt, Impressum, Leistungen/Angebot, Über uns/Team when discoverable +- [x] #3 Crawler extracts visible text, page title, meta description, headings, links, phone numbers, email candidates, email source URLs, contact-person context, and CTA/contact-form signals +- [x] #4 Extracted email candidates are classified through the TASK-7 rules: generic business emails are preferred; named emails are accepted only when explicitly published as business contact addresses; no guessed addresses are generated +- [x] #5 Leads discovered by Google Places with a website are automatically scheduled for contact enrichment before they remain in Kontakt fehlt; found usable email updates the lead contact fields and status while preserving phone and source data +- [x] #6 Simple technical checks include HTTPS/final URL, missing title/meta description, visible contact path, and obvious broken internal links within the crawl limit +- [x] #7 Crawler failures produce useful dashboard-visible errors without blocking unrelated leads - - ## Implementation Plan -1. Add Playwright runtime setup compatible with local development and Coolify container deployment. -2. Define crawl limits, viewports, timeout behavior, and allowed same-domain URL rules. -3. Capture homepage desktop/mobile screenshots and upload them to Convex storage. -4. Discover and inspect relevant subpages with bounded depth. -5. Extract visible text, metadata, links, phone numbers, email candidates, contact-person context, CTA/contact-form signals, and source URLs. -6. Normalize and score email candidates, then call the existing TASK-7 lead review/contact qualification path so usable emails update lead contact fields and unqualified named emails do not. -7. 
Add contact-enrichment run state and dashboard-visible run events/errors for leads that still need manual contact research. -8. Persist extracted raw evidence, technical checks, screenshots, and crawler errors in Convex. +1. Worker A: add pure crawler/extraction helpers with RED/GREEN tests. +2. Worker B: add Convex schema/run/storage persistence with RED/GREEN tests. +3. Worker C: wire lead-discovery scheduling/contact update flow with RED/GREEN tests. +4. Worker D: add dashboard-visible enrichment state/error UI with RED/GREEN tests where practical. +5. Orchestrator: run spec review, code-quality review, full verification, and update acceptance criteria without marking Done. ## Implementation Notes Expanded TASK-8 to cover website-based contact enrichment because Google Places does not provide business email fields. This keeps email handling evidence-based and reuses TASK-7 qualification rules instead of guessing addresses. + +Orchestration started on branch codex-task-8-playwright-enrichment. Parallel wave 1 dispatched with gpt-5.3-codex-spark: Worker A owns lib/website-crawler.ts + tests/website-crawler.test.ts; Worker B owns convex/schema.ts + schema tests; Worker C owns Playwright package/runtime docs. All workers instructed to use TDD or config verification and avoid unrelated changes. + +Completed wave 1 foundations: Playwright runtime/docs approved; crawler helper spec+quality approved; Convex enrichment schema/run-type parity spec+quality approved. Wave 2 dispatched with gpt-5.3-codex-spark: Worker D owns convex/websiteEnrichment.ts action/persistence; Worker E owns lead-discovery scheduling integration. Orchestrator remains code-review/integration only. + +2026-06-04: Worker D started implementing convex/websiteEnrichment.ts with unit/source tests for queue/process/persist enrichment flow and Playwright evidence capture. 
+ +2026-06-04: Added TASK-8 source tests for website-enrichment action queue/process/persistence contract and confirmed all assertions pass with existing implementation. + +Worker G retry: moved website enrichment scheduling out of persistDiscoveredLeads into processCampaignRun (returns queue items), scoped startCampaignRun active checks to by_type_and_status campaign running, and added source assertions for this sequencing. + +Implementation complete pending user confirmation. Built Playwright Chromium website enrichment with bounded crawl, desktop/mobile screenshot storage, raw evidence tables, TASK-7 email qualification reuse, post-discovery scheduling, technical checks, and dashboard-visible run events/errors. Final verification passed: pnpm exec tsc -p tsconfig.json; pnpm test (105/105); pnpm lint (0 errors, existing generated BetterAuth warnings only); pnpm exec convex codegen --dry-run --typecheck enable. + +2026-06-04: Updated source tests/README/.env for TASK-8 browser-runtime strategy migration to @sparticuz/chromium-min and TASK8_BROWSER_ASSET_URL deployment expectations. + +Resolved Convex Playwright runtime follow-up: local npx playwright install only populates the developer machine cache, not Convex runtime. Full playwright was replaced with playwright-core + @sparticuz/chromium-min and a required TASK8_BROWSER_ASSET_URL source so Convex no longer relies on /home/sbx_user ms-playwright cache. Verification passed: pnpm exec tsc -p tsconfig.json; pnpm test; pnpm lint (existing generated BetterAuth warnings only); pnpm exec convex codegen --dry-run --typecheck enable. + +TASK-21 runtime cache fix applied to TASK-8 crawler action: stale @sparticuz/chromium-min /tmp cache is invalidated when browser asset source changes, addressing repeated /tmp/chromium cannot execute binary file after x64/arm64 URL changes. 
+ +TASK-8 crawler action now explicitly prepares @sparticuz/chromium-min AL2023 shared libraries for Convex to address /tmp/chromium libnspr4.so missing errors before screenshot/crawl launch. + +TASK-23 extractor improvement applied: website enrichment now extracts published emails from mailto links with query params, common German obfuscations, HTML entities/spaced separators, and footer/impressum/contact contexts while preserving TASK-7 no-guessing rules. + +TASK-24 Bock Rechtsanwaelte follow-up: mailto candidates on real Impressum HTML were found but incorrectly marked non-business due index mismatch in context detection. Fixed mailto business-context detection and email-label contactPerson suppression; captured Bock HTML now yields usable chemnitz@bock-rechtsanwaelte.de. diff --git a/components/campaigns/campaigns-board.tsx b/components/campaigns/campaigns-board.tsx index 2816495..fbad89b 100644 --- a/components/campaigns/campaigns-board.tsx +++ b/components/campaigns/campaigns-board.tsx @@ -267,187 +267,81 @@ export function CampaignsBoard() { ) : ( - <> -
- - - - - - - - - - - - - - - {campaignsSorted.map((campaign) => ( - + {campaignsSorted.map((campaign) => ( + + +
+
+ {campaign.name} + + {formatNiche(campaign)} + +
+ -
+ {campaign.status === "active" ? "Aktiv" : "Pausiert"} + + + - - - - - - - - - - - - - ))} - -
KampagnePLZ / RadiusCadenceLimitsStatusLaufAktionen
-
-

{campaign.name}

-

- {formatNiche(campaign)} -

-
-
-
-

- - {campaign.postalCode} -

-

{campaign.radiusKm} km Umkreis

-
-
- - {recurrenceLabel[campaign.recurrence]} - - -

- Leads: {campaign.maxNewLeadsPerRun} · Audits:{" "} - {campaign.maxAuditsPerRun} -

-
- - {campaign.status === "active" ? "Aktiv" : "Pausiert"} - - -
-

Letzter Lauf: {formatDateTime(campaign.lastRunAt)}

-

Nächster Lauf: {formatDateTime(campaign.nextRunAt)}

-

Run-Status: {statusLabel[campaign.currentRunStatus] ?? campaign.currentRunStatus}

-
-
-
- - - -
-
-
- -
- {campaignsSorted.map((campaign) => ( - - -
-
- {campaign.name} - - {formatNiche(campaign)} - -
- - {campaign.status === "active" ? "Aktiv" : "Pausiert"} - + +
+
+ + {campaign.postalCode}
- + {campaign.radiusKm} km +
+ +
+

Cadence: {recurrenceLabel[campaign.recurrence]}

+

+ Limits: L {campaign.maxNewLeadsPerRun}, A{" "} + {campaign.maxAuditsPerRun} +

+
+
+

Letzter Lauf: {formatDateTime(campaign.lastRunAt)}

+

Nächster Lauf: {formatDateTime(campaign.nextRunAt)}

+

+ Run-Status: {statusLabel[campaign.currentRunStatus] ?? campaign.currentRunStatus} +

+
- -
-
- - {campaign.postalCode} -
- {campaign.radiusKm} km -
- -
-

Cadence: {recurrenceLabel[campaign.recurrence]}

-

- Limits: L {campaign.maxNewLeadsPerRun}, A{" "} - {campaign.maxAuditsPerRun} -

-
-
-

Letzter Lauf: {formatDateTime(campaign.lastRunAt)}

-

Nächster Lauf: {formatDateTime(campaign.nextRunAt)}

-

- Run-Status: {statusLabel[campaign.currentRunStatus] ?? campaign.currentRunStatus} -

-
- -
- - - -
-
- - ))} -
- +
+ + + +
+ +
+ ))} +
)} ); diff --git a/components/leads/leads-review-table.tsx b/components/leads/leads-review-table.tsx index 913c6cd..c1d8c9f 100644 --- a/components/leads/leads-review-table.tsx +++ b/components/leads/leads-review-table.tsx @@ -22,7 +22,7 @@ import { type LeadBlacklistStatus, } from "@/lib/dashboard-model"; import { Button } from "@/components/ui/button"; -import { Card } from "@/components/ui/card"; +import { Card, CardHeader } from "@/components/ui/card"; import { Input } from "@/components/ui/input"; import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "@/components/ui/select"; import { Badge } from "@/components/ui/badge"; @@ -148,59 +148,23 @@ export function LeadsReviewTable() {

Leads prüfen

-
- -
-
- - - - - - - - - - - - - - {leads === undefined ? ( - - - - - - ) : sortedLeads.length === 0 ? ( - - - - - - ) : ( - - {sortedLeads.map((lead) => ( - - ))} - - )} -
Firma / OrtKontakt + QuellePrioritätKontaktstatusQualitätReview-FelderAktionen
-

- Leads werden geladen… -

-
-

- Keine Leads vorhanden. Bitte zuerst eine Kampagne starten - oder importieren. -

-
-
-
-
+
+ {leads === undefined ? ( +

Leads werden geladen…

+ ) : sortedLeads.length === 0 ? ( +

+ Keine Leads vorhanden. Bitte zuerst eine Kampagne starten oder + importieren. +

+ ) : ( + sortedLeads.map((lead) => ( + + )) + )}
{actionMessage ? ( @@ -219,6 +183,7 @@ function LeadReviewRow({ lead: LeadRow; onActionMessage: (value: string) => void; }) { + const [isExpanded, setIsExpanded] = useState(false); const [draft, setDraft] = useState(() => ({ priority: lead.priority, contactStatus: lead.contactStatus, @@ -313,264 +278,290 @@ function LeadReviewRow({ setDraft((current) => ({ ...current, [field]: value })); }; + const detailsId = `lead-review-details-${lead._id}`; + return ( - - -

{lead.companyName}

-

- - {lead.niche ?? "Nische offen"} -

-

- - {location} -

- {lead.address ? ( -

- {lead.address} -

- ) : null} - + + +
+
+
+

+ {lead.companyName} +

+

+ + + {lead.niche ?? "Nische offen"} + +

+

+ + + {location} + +

+
- -

- - - {lead.email || "Keine E-Mail"} - -

- {lead.phone ? ( -

- - {lead.phone} -

- ) : null} -

- Quelle: {contactSourceLabel(lead)} -

- {lead.websiteDomain ? ( -

- Domain: {lead.websiteDomain} -

- ) : null} - +

+ {getLeadPriorityLabel(draft.priority)} +

+
- -

+

+ + + {lead.email || "Keine E-Mail"} + +

+ {lead.phone ? ( +

+ + {lead.phone} +

+ ) : null} +

+ Quelle: {contactSourceLabel(lead)} +

+ {lead.websiteDomain ? ( +

Domain: {lead.websiteDomain}

+ ) : null} +
+
+ + +
+ +
+ + + ); } diff --git a/convex.json b/convex.json new file mode 100644 index 0000000..83a1997 --- /dev/null +++ b/convex.json @@ -0,0 +1,6 @@ +{ + "$schema": "https://raw.githubusercontent.com/get-convex/convex-backend/main/npm-packages/convex/schemas/convex.schema.json", + "node": { + "externalPackages": ["playwright-core", "@sparticuz/chromium-min"] + } +} diff --git a/convex/_generated/api.d.ts b/convex/_generated/api.d.ts index 661d4a7..8902a97 100644 --- a/convex/_generated/api.d.ts +++ b/convex/_generated/api.d.ts @@ -19,6 +19,8 @@ import type * as outreach from "../outreach.js"; import type * as runs from "../runs.js"; import type * as settings from "../settings.js"; import type * as storage from "../storage.js"; +import type * as websiteEnrichment from "../websiteEnrichment.js"; +import type * as websiteEnrichmentAction from "../websiteEnrichmentAction.js"; import type { ApiFromModules, @@ -38,6 +40,8 @@ declare const fullApi: ApiFromModules<{ runs: typeof runs; settings: typeof settings; storage: typeof storage; + websiteEnrichment: typeof websiteEnrichment; + websiteEnrichmentAction: typeof websiteEnrichmentAction; }>; /** diff --git a/convex/domain.ts b/convex/domain.ts index f877b46..fe6d4de 100644 --- a/convex/domain.ts +++ b/convex/domain.ts @@ -84,6 +84,7 @@ export const RUN_TYPES = [ "audit", "outreach", "lifecycle", + "website_enrichment", ] as const; export const RUN_STATUSES = [ "pending", diff --git a/convex/leadDiscovery.ts b/convex/leadDiscovery.ts index 31fa279..0cfb4ca 100644 --- a/convex/leadDiscovery.ts +++ b/convex/leadDiscovery.ts @@ -17,6 +17,7 @@ import { buildLeadDiscoveryLeadRecord, buildLeadDiscoveryCounters, getLeadDiscoveryPriority, + shouldScheduleWebsiteEnrichment, } from "../lib/lead-discovery-run"; import { calculateNextRunAt } from "../lib/campaign-scheduling"; @@ -214,6 +215,11 @@ export const processCampaignRun = internalAction({ skippedDuplicates: number; skippedBlacklisted: number; errors: number; + 
websiteEnrichmentQueue: Array<{ + leadId: Id<"leads">; + companyName: string; + website: string; + }>; } = await ctx.runMutation(internal.leadDiscovery.persistDiscoveredLeads, { runId: args.runId, campaignId: campaign._id, @@ -223,6 +229,31 @@ export const processCampaignRun = internalAction({ candidates, }); + for (const enrichment of result.websiteEnrichmentQueue) { + await ctx.runMutation(internal.websiteEnrichment.queueLeadEnrichment, { + leadId: enrichment.leadId, + parentRunId: args.runId, + }); + + await ctx.runMutation(internal.leadDiscovery.appendRunEvent, { + runId: args.runId, + level: "info", + message: "Website-Kontaktanreicherung geplant.", + details: [ + { + label: "Unternehmen", + value: enrichment.companyName, + source: "google_places", + }, + { + label: "Website", + value: enrichment.website, + source: "google_places", + }, + ], + }); + } + await ctx.runMutation(internal.leadDiscovery.finishCampaignRun, { runId: args.runId, status: "succeeded", @@ -275,7 +306,9 @@ export const startCampaignRun = internalMutation({ const activeRunning = await ctx.db .query("agentRuns") - .withIndex("by_status", (q) => q.eq("status", "running")) + .withIndex("by_type_and_status", (q) => + q.eq("type", "campaign").eq("status", "running"), + ) .take(1); if (activeRunning.length > 0) { @@ -390,6 +423,11 @@ export const persistDiscoveredLeads = internalMutation({ let skippedDuplicates = 0; let skippedBlacklisted = 0; let errors = 0; + const websiteEnrichmentQueue: Array<{ + leadId: Id<"leads">; + companyName: string; + website: string; + }> = []; for (const candidate of args.candidates) { if (leadsCreated >= args.maxNewLeads) { @@ -556,8 +594,15 @@ export const persistDiscoveredLeads = internalMutation({ lead.duplicateOfLeadId = probableDuplicateLead._id; } - await ctx.db.insert("leads", lead); + const leadId = await ctx.db.insert("leads", lead); leadsCreated += 1; + if (shouldScheduleWebsiteEnrichment(lead)) { + websiteEnrichmentQueue.push({ + leadId, + companyName: 
lead.companyName, + website: lead.websiteDomain ?? lead.websiteUrl ?? "unbekannt", + }); + } await ctx.db.insert("agentRunEvents", { runId: args.runId, level: "info", @@ -589,6 +634,7 @@ export const persistDiscoveredLeads = internalMutation({ skippedDuplicates, skippedBlacklisted, errors, + websiteEnrichmentQueue, }; }, }); diff --git a/convex/runs.ts b/convex/runs.ts index 2de823c..91f8f8e 100644 --- a/convex/runs.ts +++ b/convex/runs.ts @@ -1,26 +1,17 @@ import { v } from "convex/values"; -import { normalizeListLimit } from "./domain"; +import { + RUN_EVENT_LEVELS, + RUN_STATUSES, + RUN_TYPES, + normalizeListLimit, +} from "./domain"; import { mutation, query } from "./_generated/server"; -const runType = v.union( - v.literal("campaign"), - v.literal("lead_discovery"), - v.literal("audit"), - v.literal("outreach"), - v.literal("lifecycle"), -); -const runStatus = v.union( - v.literal("pending"), - v.literal("running"), - v.literal("succeeded"), - v.literal("failed"), - v.literal("canceled"), -); +const runType = v.union(...RUN_TYPES.map((type) => v.literal(type))); +const runStatus = v.union(...RUN_STATUSES.map((status) => v.literal(status))); const eventLevel = v.union( - v.literal("info"), - v.literal("warning"), - v.literal("error"), + ...RUN_EVENT_LEVELS.map((level) => v.literal(level)), ); export const create = mutation({ @@ -116,6 +107,16 @@ export const list = query({ .take(limit); } + if (args.type) { + const type = args.type; + + return await ctx.db + .query("agentRuns") + .withIndex("by_type", (q) => q.eq("type", type)) + .order("desc") + .take(limit); + } + if (args.status) { const status = args.status; diff --git a/convex/schema.ts b/convex/schema.ts index 4449560..92860f5 100644 --- a/convex/schema.ts +++ b/convex/schema.ts @@ -1,6 +1,11 @@ import { defineSchema, defineTable } from "convex/server"; import { v } from "convex/values"; import { tables as authTables } from "./betterAuth/schema"; +import { + RUN_EVENT_LEVELS, + RUN_STATUSES, + RUN_TYPES, 
+} from "./domain"; const campaignStatus = v.union(v.literal("active"), v.literal("paused")); const leadPriority = v.union( @@ -75,24 +80,19 @@ const blacklistType = v.union( v.literal("company"), v.literal("google_place_id"), ); -const runType = v.union( - v.literal("campaign"), - v.literal("lead_discovery"), - v.literal("audit"), - v.literal("outreach"), - v.literal("lifecycle"), -); -const runStatus = v.union( - v.literal("pending"), - v.literal("running"), - v.literal("succeeded"), - v.literal("failed"), - v.literal("canceled"), +const websiteEnrichmentPageKind = v.union( + v.literal("homepage"), + v.literal("contact"), + v.literal("impressum"), + v.literal("services"), + v.literal("about"), + v.literal("team"), + v.literal("other"), ); +const runType = v.union(...RUN_TYPES.map((type) => v.literal(type))); +const runStatus = v.union(...RUN_STATUSES.map((status) => v.literal(status))); const runEventLevel = v.union( - v.literal("info"), - v.literal("warning"), - v.literal("error"), + ...RUN_EVENT_LEVELS.map((level) => v.literal(level)), ); const screenshotViewport = v.union(v.literal("desktop"), v.literal("mobile")); const settingsValue = v.union(v.string(), v.number(), v.boolean(), v.null()); @@ -255,6 +255,85 @@ export default defineSchema({ .index("by_auditId_and_viewport", ["auditId", "viewport"]) .index("by_storageId", ["storageId"]), + websiteCrawlPages: defineTable({ + leadId: v.id("leads"), + runId: v.optional(v.id("agentRuns")), + sourceUrl: v.string(), + finalUrl: v.string(), + pageKind: websiteEnrichmentPageKind, + title: v.optional(v.string()), + metaDescription: v.optional(v.string()), + headings: v.array(v.string()), + visibleTextExcerpt: v.optional(v.string()), + hasContactFormSignal: v.boolean(), + hasContactCtaSignal: v.boolean(), + createdAt: v.number(), + }) + .index("by_leadId", ["leadId"]) + .index("by_runId", ["runId"]) + .index("by_leadId_and_createdAt", ["leadId", "createdAt"]), + + websiteCrawlLinks: defineTable({ + leadId: 
v.id("leads"), + runId: v.optional(v.id("agentRuns")), + pageUrl: v.string(), + href: v.string(), + text: v.optional(v.string()), + isInternal: v.boolean(), + isBroken: v.optional(v.boolean()), + createdAt: v.number(), + }) + .index("by_leadId", ["leadId"]) + .index("by_runId", ["runId"]), + + websiteEmailCandidates: defineTable({ + leadId: v.id("leads"), + runId: v.optional(v.id("agentRuns")), + email: v.string(), + normalizedEmail: v.string(), + emailSource: v.string(), + sourceUrl: v.string(), + contactPerson: v.optional(v.string()), + isBusinessContactAddress: v.boolean(), + isGeneric: v.boolean(), + accepted: v.boolean(), + createdAt: v.number(), + }) + .index("by_leadId", ["leadId"]) + .index("by_normalizedEmail", ["normalizedEmail"]) + .index("by_runId", ["runId"]), + + websiteCrawlScreenshots: defineTable({ + leadId: v.id("leads"), + runId: v.optional(v.id("agentRuns")), + storageId: v.id("_storage"), + viewport: screenshotViewport, + sourceUrl: v.string(), + capturedAt: v.number(), + width: v.number(), + height: v.number(), + mimeType: v.string(), + createdAt: v.number(), + }) + .index("by_leadId", ["leadId"]) + .index("by_runId", ["runId"]) + .index("by_storageId", ["storageId"]), + + websiteTechnicalChecks: defineTable({ + leadId: v.id("leads"), + runId: v.optional(v.id("agentRuns")), + sourceUrl: v.string(), + finalUrl: v.optional(v.string()), + usesHttps: v.boolean(), + missingTitle: v.boolean(), + missingMetaDescription: v.boolean(), + hasVisibleContactPath: v.boolean(), + brokenInternalLinkCount: v.number(), + createdAt: v.number(), + }) + .index("by_leadId", ["leadId"]) + .index("by_runId", ["runId"]), + outreachRecords: defineTable({ leadId: v.id("leads"), auditId: v.optional(v.id("audits")), @@ -309,7 +388,9 @@ export default defineSchema({ updatedAt: v.number(), }) .index("by_status", ["status"]) + .index("by_type", ["type"]) .index("by_type_and_status", ["type", "status"]) + .index("by_type_and_status_and_leadId", ["type", "status", "leadId"]) 
.index("by_campaignId_and_updatedAt", ["campaignId", "updatedAt"]) .index("by_campaignId_and_status", ["campaignId", "status"]) .index("by_auditId", ["auditId"]), diff --git a/convex/websiteEnrichment.ts b/convex/websiteEnrichment.ts new file mode 100644 index 0000000..10a968d --- /dev/null +++ b/convex/websiteEnrichment.ts @@ -0,0 +1,408 @@ +import { v } from "convex/values"; +import { internal } from "./_generated/api"; +import type { Doc } from "./_generated/dataModel"; +import { internalMutation } from "./_generated/server"; +import { normalizeEmailAddress } from "../lib/lead-discovery-google"; + +const RUN_COUNTER_TEMPLATE = { + leadsFound: 0, + leadsCreated: 0, + auditsCreated: 0, + outreachPrepared: 0, + errors: 0, +}; + +type WebsiteLead = Pick, "_id" | "websiteUrl" | "contactStatus">; +type LeadContactStatus = Doc<"leads">["contactStatus"]; + +export const queueLeadEnrichment = internalMutation({ + args: { + leadId: v.id("leads"), + parentRunId: v.optional(v.id("agentRuns")), + }, + returns: v.union(v.id("agentRuns"), v.null()), + handler: async (ctx, args) => { + const now = Date.now(); + const lead = await ctx.db.get(args.leadId); + + if (!lead || !lead.websiteUrl) { + return null; + } + + const activePending = await ctx.db + .query("agentRuns") + .withIndex("by_type_and_status_and_leadId", (q) => + q + .eq("type", "website_enrichment") + .eq("status", "pending") + .eq("leadId", args.leadId), + ) + .take(1); + + const activeRunning = await ctx.db + .query("agentRuns") + .withIndex("by_type_and_status_and_leadId", (q) => + q + .eq("type", "website_enrichment") + .eq("status", "running") + .eq("leadId", args.leadId), + ) + .take(1); + + if (activePending.length > 0) { + return activePending[0]._id; + } + if (activeRunning.length > 0) { + return activeRunning[0]._id; + } + + const runId = await ctx.db.insert("agentRuns", { + type: "website_enrichment", + leadId: args.leadId, + status: "pending", + counters: RUN_COUNTER_TEMPLATE, + currentStep: 
"website_enrichment", + createdAt: now, + updatedAt: now, + }); + + await ctx.db.insert("agentRunEvents", { + runId, + level: "info", + message: "Website-Enrichment wurde in die Warteschlange gesetzt.", + details: [ + { label: "Lead", value: args.leadId }, + ...(args.parentRunId + ? [{ label: "Parent-Run", value: args.parentRunId }] + : []), + ], + createdAt: now, + }); + + await ctx.scheduler.runAfter( + 0, + internal.websiteEnrichmentAction.processLeadEnrichment, + { + runId, + }, + ); + + return runId; + }, +}); + +export const startLeadEnrichmentRun = internalMutation({ + args: { runId: v.id("agentRuns") }, + handler: async (ctx, args): Promise< + { lead: WebsiteLead } | null + > => { + const now = Date.now(); + const run = await ctx.db.get(args.runId); + + if (!run || run.type !== "website_enrichment" || run.status !== "pending") { + return null; + } + + if (!run.leadId) { + await ctx.db.patch(args.runId, { + status: "failed", + currentStep: "website_enrichment", + errorSummary: "Der Lauf hat keine Lead-ID.", + updatedAt: now, + finishedAt: now, + }); + await ctx.db.insert("agentRunEvents", { + runId: args.runId, + level: "error", + message: + "Website-Enrichment konnte nicht gestartet werden: Keine Lead-ID.", + details: [{ label: "Lead-ID", value: run.leadId ?? 
"unbekannt" }], + createdAt: now, + }); + return null; + } + + const lead = await ctx.db.get(run.leadId); + + if (!lead) { + await ctx.db.patch(args.runId, { + status: "failed", + currentStep: "website_enrichment", + errorSummary: "Lead fehlt oder besitzt keine Website.", + updatedAt: now, + finishedAt: now, + }); + await ctx.db.insert("agentRunEvents", { + runId: args.runId, + level: "error", + message: + "Website-Enrichment konnte nicht gestartet werden: Kein Lead mit Website-URL.", + details: [{ label: "Lead-ID", value: run.leadId }], + createdAt: now, + }); + return null; + } + + if (!lead.websiteUrl) { + await ctx.db.patch(args.runId, { + status: "failed", + currentStep: "website_enrichment", + errorSummary: "Lead fehlt oder besitzt keine Website.", + updatedAt: now, + finishedAt: now, + }); + await ctx.db.insert("agentRunEvents", { + runId: args.runId, + level: "error", + message: + "Website-Enrichment konnte nicht gestartet werden: Kein Lead mit Website-URL.", + details: [{ label: "Lead-ID", value: lead._id }], + createdAt: now, + }); + await ctx.db.patch(lead._id, { + contactStatusReason: + "Website-URL fehlt für das Website-Enrichment.", + updatedAt: now, + }); + return null; + } + + await ctx.db.patch(args.runId, { + status: "running", + currentStep: "website_enrichment", + startedAt: now, + updatedAt: now, + }); + + await ctx.db.insert("agentRunEvents", { + runId: args.runId, + level: "info", + message: "Website-Enrichment gestartet.", + details: [{ label: "Lead", value: lead._id }], + createdAt: now, + }); + + return { + lead: { + _id: lead._id, + websiteUrl: lead.websiteUrl, + contactStatus: lead.contactStatus, + }, + }; + }, +}); + +export const persistLeadEnrichmentResult = internalMutation({ + args: { + runId: v.id("agentRuns"), + leadId: v.id("leads"), + pages: v.array( + v.object({ + sourceUrl: v.string(), + finalUrl: v.string(), + pageKind: v.union( + v.literal("homepage"), + v.literal("contact"), + v.literal("impressum"), + 
v.literal("services"), + v.literal("about"), + v.literal("team"), + v.literal("other"), + ), + title: v.optional(v.string()), + metaDescription: v.optional(v.string()), + headings: v.array(v.string()), + visibleTextExcerpt: v.optional(v.string()), + hasContactFormSignal: v.boolean(), + hasContactCtaSignal: v.boolean(), + }), + ), + links: v.array( + v.object({ + pageUrl: v.string(), + href: v.string(), + text: v.optional(v.string()), + isInternal: v.boolean(), + isBroken: v.optional(v.boolean()), + }), + ), + emailCandidates: v.array( + v.object({ + email: v.string(), + normalizedEmail: v.string(), + emailSource: v.string(), + sourceUrl: v.string(), + contactPerson: v.optional(v.string()), + isBusinessContactAddress: v.boolean(), + isGeneric: v.boolean(), + accepted: v.boolean(), + }), + ), + screenshots: v.array( + v.object({ + storageId: v.id("_storage"), + viewport: v.union(v.literal("desktop"), v.literal("mobile")), + sourceUrl: v.string(), + capturedAt: v.number(), + width: v.number(), + height: v.number(), + mimeType: v.string(), + }), + ), + technicalChecks: v.array( + v.object({ + sourceUrl: v.string(), + finalUrl: v.optional(v.string()), + usesHttps: v.boolean(), + missingTitle: v.boolean(), + missingMetaDescription: v.boolean(), + hasVisibleContactPath: v.boolean(), + brokenInternalLinkCount: v.number(), + }), + ), + }, + handler: async (ctx, args) => { + const createdAt = Date.now(); + + for (const page of args.pages) { + await ctx.db.insert("websiteCrawlPages", { + ...page, + leadId: args.leadId, + runId: args.runId, + createdAt, + }); + } + + for (const link of args.links) { + await ctx.db.insert("websiteCrawlLinks", { + ...link, + leadId: args.leadId, + runId: args.runId, + createdAt, + }); + } + + for (const candidate of args.emailCandidates) { + await ctx.db.insert("websiteEmailCandidates", { + ...candidate, + leadId: args.leadId, + runId: args.runId, + createdAt, + }); + } + + for (const screenshot of args.screenshots) { + await 
ctx.db.insert("websiteCrawlScreenshots", { + ...screenshot, + leadId: args.leadId, + runId: args.runId, + createdAt, + }); + } + + for (const checks of args.technicalChecks) { + await ctx.db.insert("websiteTechnicalChecks", { + ...checks, + leadId: args.leadId, + runId: args.runId, + createdAt, + }); + } + }, +}); + +export const finishLeadEnrichmentRun = internalMutation({ + args: { + runId: v.id("agentRuns"), + status: v.union( + v.literal("succeeded"), + v.literal("failed"), + v.literal("canceled"), + ), + currentStep: v.optional(v.string()), + errorSummary: v.optional(v.string()), + errors: v.optional(v.number()), + }, + handler: async (ctx, args) => { + const now = Date.now(); + + await ctx.db.patch(args.runId, { + status: args.status, + updatedAt: now, + finishedAt: now, + currentStep: args.currentStep ?? "website_enrichment", + errorSummary: args.errorSummary, + counters: { + leadsFound: 1, + leadsCreated: 0, + auditsCreated: 0, + outreachPrepared: 0, + errors: args.errors ?? 0, + }, + }); + }, +}); + +export const patchLeadFromWebsiteEnrichment = internalMutation({ + args: { + leadId: v.id("leads"), + email: v.optional(v.string()), + emailSource: v.optional(v.string()), + contactPerson: v.optional(v.string()), + currentContactStatus: v.union( + v.literal("new"), + v.literal("missing_contact"), + v.literal("audit_ready"), + v.literal("outreach_ready"), + v.literal("contacted"), + v.literal("replied"), + v.literal("do_not_contact"), + ), + contactStatusReason: v.optional(v.string()), + }, + handler: async (ctx, args) => { + const lead = await ctx.db.get(args.leadId); + if (!lead) { + return null; + } + + type LeadPatch = { + email?: string; + normalizedEmail?: string; + emailSource?: string; + contactPerson?: string; + contactStatus?: LeadContactStatus; + contactStatusReason?: string; + updatedAt: number; + }; + + const patch: LeadPatch = { + updatedAt: Date.now(), + }; + + if (args.email && args.emailSource) { + const normalized = 
normalizeEmailAddress(args.email); + if (normalized) { + patch.email = normalized; + patch.normalizedEmail = normalized; + patch.emailSource = args.emailSource; + } + } + + if (args.contactPerson) { + patch.contactPerson = args.contactPerson; + } + + if (args.contactStatusReason !== undefined) { + patch.contactStatusReason = args.contactStatusReason; + } else if (args.email && args.currentContactStatus === "missing_contact") { + patch.contactStatus = "new"; + } + + if (Object.keys(patch).length > 1) { + await ctx.db.patch(args.leadId, patch); + } + + return args.leadId; + }, +}); diff --git a/convex/websiteEnrichmentAction.ts b/convex/websiteEnrichmentAction.ts new file mode 100644 index 0000000..d17991e --- /dev/null +++ b/convex/websiteEnrichmentAction.ts @@ -0,0 +1,725 @@ +"use node"; + +import type { Browser, BrowserContext } from "playwright-core"; +import { createHash } from "node:crypto"; +import { access, readFile, rm, writeFile } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import path from "node:path"; +import { v } from "convex/values"; +import { + buildTechnicalChecks, + discoverRelevantSubpageUrls, + extractContactSignalsFromHtmlLikeText, + isSameRegistrableHostishDomain, + normalizeCrawlUrl, +} from "../lib/website-crawler"; +import { + getUsableContactEmailFromEntries, + normalizeEmailAddress, +} from "../lib/lead-discovery-google"; +import { api, internal } from "./_generated/api"; +import type { Doc, Id } from "./_generated/dataModel"; +import { internalAction, type ActionCtx } from "./_generated/server"; + +const DEFAULT_CRAWL_TIMEOUT_MS = 60_000; +const DEFAULT_CRAWL_MAX_PAGES = 5; +const MAX_PERSISTED_LINKS = 120; +const MAX_PERSISTED_EMAIL_CANDIDATES = 40; +const SCREENSHOT_MIME_TYPE = "image/png"; +const CHROMIUM_SOURCE_MARKER_FILE = path.join(tmpdir(), "chromium-source.sha256"); +const CHROMIUM_EXECUTABLE_PATH = path.join(tmpdir(), "chromium"); +const CHROMIUM_PACK_PATH = path.join(tmpdir(), "chromium-pack"); +const 
GENERIC_EMAIL_LOCALS = new Set([ + "info", + "kontakt", + "contact", + "sales", + "team", + "support", + "service", + "hello", + "marketing", + "admin", + "office", + "impressum", + "post", +]); +const CHROMIUM_EXECUTABLE_SOURCE_ENV_VARS = [ + "TASK8_BROWSER_ASSET_URL", + "TASK8_CHROMIUM_EXECUTABLE_URL", + "TASK8_CHROMIUM_EXECUTABLE", +]; + +type EnrichmentPageKind = + | "homepage" + | "contact" + | "impressum" + | "services" + | "about" + | "team" + | "other"; +type CrawlPageLink = { + href: string; + text: string; + isInternal: boolean; +}; +type PersistedCrawlLink = CrawlPageLink & { + pageUrl: string; +}; +type PageResult = { + sourceUrl: string; + finalUrl: string; + pageKind: EnrichmentPageKind; + title: string; + metaDescription: string; + headings: string[]; + visibleText: string; + links: CrawlPageLink[]; + emailCandidates: Array<{ + email: string; + emailSource: string; + contactPerson: string | null; + isBusinessContactAddress: boolean; + isGeneric: boolean; + sourceUrl: string; + accepted: boolean; + normalizedEmail: string; + }>; + hasContactFormSignal: boolean; + hasContactCtaSignal: boolean; +}; +type StoredScreenshot = { + storageId: Id<"_storage">; + viewport: "desktop" | "mobile"; + sourceUrl: string; + capturedAt: number; + width: number; + height: number; + mimeType: string; +}; + +type WebsiteLead = Pick< + Doc<"leads">, + "_id" | "websiteUrl" | "contactStatus" +>; +type StartedLead = { + lead: WebsiteLead; +}; + +type ServerlessChromiumModule = { + args: string[]; + executablePath: (input?: string) => Promise; + inflate: (filePath: string) => Promise; + setupLambdaEnvironment: (baseLibPath: string) => void; +}; + +function messageFromError(error: unknown) { + return error instanceof Error ? 
error.message : String(error); +} + +function readPositiveIntEnv(key: string, fallback: number) { + const raw = process.env[key]?.trim(); + if (!raw) { + return fallback; + } + const parsed = Number.parseInt(raw, 10); + return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback; +} + +function crawlTimeoutMs() { + return readPositiveIntEnv("TASK8_CRAWL_TIMEOUT_MS", DEFAULT_CRAWL_TIMEOUT_MS); +} + +function crawlMaxPages() { + return Math.max( + 1, + Math.min( + DEFAULT_CRAWL_MAX_PAGES, + readPositiveIntEnv("TASK8_CRAWL_MAX_PAGES", DEFAULT_CRAWL_MAX_PAGES), + ), + ); +} + +function makePageKind(url: string, rootUrl: string): EnrichmentPageKind { + const normalizedRoot = normalizeCrawlUrl(rootUrl); + if (!normalizedRoot) { + return "other"; + } + + const homepagePath = new URL(normalizedRoot).pathname.replace(/\/$/, "") || "/"; + let pageUrl: string; + try { + pageUrl = new URL(url).pathname.toLowerCase(); + } catch { + return "other"; + } + + if (pageUrl === homepagePath || pageUrl === homepagePath.replace(/\/$/, "")) { + return "homepage"; + } + + const normalizedPath = pageUrl.toLowerCase(); + if (/(?:^|\/)(kontakt|contact)(?:[-/]|$)/.test(normalizedPath)) { + return "contact"; + } + if (/(?:^|\/)(impressum|imprint)(?:[-/]|$)/.test(normalizedPath)) { + return "impressum"; + } + if (/(?:^|\/)(leistungen|angebot|services?)(?:[-/]|$)/.test(normalizedPath)) { + return "services"; + } + if (/(?:^|\/)(ueber|über|about|team)(?:[-/]|$)/.test(normalizedPath)) { + return "about"; + } + + return "other"; +} + +function trimExcerpt(value: string) { + return value.replace(/\s+/g, " ").trim().slice(0, 1200); +} + +function isGenericBusinessEmail(email: string) { + const local = email.split("@")[0]?.toLowerCase() ?? ""; + const base = local.split("+")[0] ?? 
""; + return GENERIC_EMAIL_LOCALS.has(base); +} + +async function loadPlaywrightModules() { + const [playwrightCore, chromiumPackage] = await Promise.all([ + import("playwright-core"), + import("@sparticuz/chromium-min"), + ]); + return { + playwrightCore, + serverlessChromium: { + args: chromiumPackage.default.args, + executablePath: chromiumPackage.default.executablePath, + inflate: chromiumPackage.inflate, + setupLambdaEnvironment: chromiumPackage.setupLambdaEnvironment, + } as ServerlessChromiumModule, + }; +} + +function getChromiumExecutableSource() { + for (const key of CHROMIUM_EXECUTABLE_SOURCE_ENV_VARS) { + const value = process.env[key]?.trim(); + if (value) { + return value; + } + } + + return null; +} + +function getChromiumSourceMarker(source: string) { + return createHash("sha256").update(source).digest("hex"); +} + +async function clearChromiumCacheForSourceMismatch(executableSource: string) { + const nextMarker = getChromiumSourceMarker(executableSource); + const marker = await readFile(CHROMIUM_SOURCE_MARKER_FILE, "utf8").catch(() => null); + if ((marker ?? 
"").trim() === nextMarker) { + return; + } + + await Promise.all([ + rm(CHROMIUM_EXECUTABLE_PATH, { force: true, recursive: true }), + rm(CHROMIUM_PACK_PATH, { force: true, recursive: true }), + ]); +} + +async function resolveChromiumExecutablePath( + chromium: ServerlessChromiumModule, +) { + const executableSource = getChromiumExecutableSource(); + if (!executableSource) { + throw new Error( + `Set TASK8_BROWSER_ASSET_URL (or legacy TASK8_CHROMIUM_EXECUTABLE_URL / TASK8_CHROMIUM_EXECUTABLE) to configure the Chromium source; no source is configured.`, + ); + } + + await clearChromiumCacheForSourceMismatch(executableSource); + const executablePath = await chromium.executablePath(executableSource); + await writeFile( + CHROMIUM_SOURCE_MARKER_FILE, + getChromiumSourceMarker(executableSource), + ); + + return executablePath; +} + +async function captureHomepageScreenshot( + ctx: ActionCtx, + context: BrowserContext, + homepageUrl: string, + viewport: "desktop" | "mobile", + timeoutMs: number, +) { + const page = await context.newPage(); + try { + await page.goto(homepageUrl, { + waitUntil: "domcontentloaded", + timeout: timeoutMs, + }); + const sourceUrl = page.url(); + const screenshot = await page.screenshot({ + fullPage: true, + type: "png", + }); + const storageId = await ctx.storage.store( + new Blob([new Uint8Array(screenshot)], { type: SCREENSHOT_MIME_TYPE }), + ); + const viewportSize = page.viewportSize() ?? 
{ width: 0, height: 0 }; + + return { + storageId, + viewport, + sourceUrl, + capturedAt: Date.now(), + width: viewportSize.width, + height: viewportSize.height, + mimeType: SCREENSHOT_MIME_TYPE, + } satisfies StoredScreenshot; + } finally { + await page.close(); + } +} + +async function crawlPage( + context: BrowserContext, + targetUrl: string, + rootUrl: string, + timeoutMs: number, +) { + const page = await context.newPage(); + try { + const response = await page.goto(targetUrl, { + waitUntil: "domcontentloaded", + timeout: timeoutMs, + }); + if (!response) { + return null; + } + + const finalUrl = page.url(); + const title = await page.title().catch(() => ""); + const metaDescription = await page + .evaluate(() => { + const meta = document.querySelector( + "meta[name='description']", + ) as HTMLMetaElement | null; + return meta?.content ?? ""; + }) + .catch(() => ""); + const content = await page.content(); + const signals = extractContactSignalsFromHtmlLikeText(content); + const headings = await page + .evaluate(() => + Array.from(document.querySelectorAll("h1, h2, h3")) + .map((element) => element.textContent?.trim() ?? "") + .filter((heading) => heading.length > 0), + ) + .catch(() => []); + const visibleText = await page.evaluate(() => { + return document.body?.innerText ?? ""; + }); + const rawLinks = await page + .evaluate(() => + Array.from(document.querySelectorAll("a[href]")).map((anchor) => ({ + href: anchor.getAttribute("href") ?? "", + text: anchor.textContent?.trim() ?? 
"", + })), + ) + .catch(() => []); + + const normalizedLinks = rawLinks + .map((link) => { + const normalizedHref = normalizeCrawlUrl(link.href, finalUrl); + if (!normalizedHref) { + return null; + } + return { + href: normalizedHref, + text: link.text, + isInternal: isSameRegistrableHostishDomain(normalizedHref, rootUrl), + }; + }) + .filter( + (entry): entry is { href: string; text: string; isInternal: boolean } => + entry !== null, + ); + + const emailCandidates = signals.emailCandidates + .map((entry) => { + const normalizedEmail = normalizeEmailAddress(entry.email); + if (!normalizedEmail) { + return null; + } + return { + email: normalizedEmail, + emailSource: finalUrl, + contactPerson: entry.contactPerson ?? null, + isBusinessContactAddress: entry.isBusinessContactAddress, + isGeneric: isGenericBusinessEmail(normalizedEmail), + sourceUrl: finalUrl, + accepted: false, + normalizedEmail, + }; + }) + .filter((entry): entry is NonNullable => entry !== null); + + return { + sourceUrl: finalUrl, + finalUrl, + pageKind: makePageKind(targetUrl, rootUrl), + title, + metaDescription, + headings, + visibleText, + links: normalizedLinks, + emailCandidates, + hasContactFormSignal: signals.hasContactFormSignal, + hasContactCtaSignal: signals.hasContactCtaSignal, + } satisfies PageResult; + } finally { + await page.close(); + } +} + +function deduplicateLeadEmailCandidates( + candidates: PageResult["emailCandidates"], +) { + const unique = new Map(); + + for (const candidate of candidates) { + if (!unique.has(candidate.normalizedEmail)) { + unique.set(candidate.normalizedEmail, candidate); + } + } + + return [...unique.values()]; +} + +function deduplicateCrawlLinks(links: PersistedCrawlLink[]) { + const unique = new Map(); + + for (const link of links) { + if (!unique.has(link.href)) { + unique.set(link.href, link); + } + } + + return [...unique.values()]; +} + +export const processLeadEnrichment = internalAction({ + args: { runId: v.id("agentRuns") }, + handler: async 
(ctx, args) => { + let started: StartedLead | null = null; + const runId = args.runId; + let browser: Browser | null = null; + let desktopContext: BrowserContext | null = null; + let mobileContext: BrowserContext | null = null; + + try { + started = await ctx.runMutation(internal.websiteEnrichment.startLeadEnrichmentRun, { + runId, + }); + + if (!started) { + return null; + } + + const rootUrl = normalizeCrawlUrl(started.lead.websiteUrl); + if (!rootUrl) { + await ctx.runMutation(internal.websiteEnrichment.finishLeadEnrichmentRun, { + runId, + status: "failed", + currentStep: "website_enrichment", + errorSummary: "Ungültige Website-URL.", + errors: 1, + }); + await ctx.runMutation(api.runs.appendEvent, { + runId, + level: "error", + message: "Website-Enrichment fehlgeschlagen: Ungültige Website-URL.", + details: [{ label: "Lead", value: started.lead._id }], + }); + await ctx.runMutation(internal.websiteEnrichment.patchLeadFromWebsiteEnrichment, { + leadId: started.lead._id, + currentContactStatus: started.lead.contactStatus, + contactStatusReason: + "Website-Enrichment fehlgeschlagen: Ungültige Website-URL.", + }); + return null; + } + + const timeoutMs = crawlTimeoutMs(); + const maxPages = crawlMaxPages(); + + const { playwrightCore, serverlessChromium } = + await loadPlaywrightModules(); + const executablePath = await resolveChromiumExecutablePath( + serverlessChromium, + ); + + const prepareChromiumSharedLibraries = async ( + chromiumRuntime: ServerlessChromiumModule, + ) => { + const runtimeArchivePath = path.join( + CHROMIUM_PACK_PATH, + "al2023.tar.br", + ); + await access(runtimeArchivePath).catch(() => { + throw new Error( + `AL2023 shared library archive not found at ${runtimeArchivePath}; cannot prepare Chromium shared libraries.`, + ); + }); + + await chromiumRuntime.inflate(runtimeArchivePath); + chromiumRuntime.setupLambdaEnvironment(path.join(tmpdir(), "al2023", "lib")); + }; + + await prepareChromiumSharedLibraries(serverlessChromium); + browser = 
await playwrightCore.chromium.launch({ + headless: true, + executablePath, + args: serverlessChromium.args, + }); + const { devices } = playwrightCore; + desktopContext = await browser.newContext({ + ...devices["Desktop Chrome"], + }); + mobileContext = await browser.newContext({ + ...devices["iPhone 11"], + }); + + const homepage = await crawlPage(desktopContext, rootUrl, rootUrl, timeoutMs); + if (!homepage) { + throw new Error("Homepage konnte nicht geladen werden."); + } + + const requestedPages = discoverRelevantSubpageUrls( + homepage.links.map((link) => link.href), + rootUrl, + ); + const crawlTargets = requestedPages.slice(0, maxPages); + const crawledPages: PageResult[] = [homepage]; + + for (const pageUrl of crawlTargets.slice(1)) { + const crawled = await crawlPage(desktopContext, pageUrl, rootUrl, timeoutMs); + if (crawled) { + crawledPages.push(crawled); + } + } + + const allLinks: PersistedCrawlLink[] = crawledPages.flatMap((page) => + page.links.map((link) => ({ + ...link, + pageUrl: page.finalUrl, + })), + ); + const internalLinks = allLinks.filter((link) => link.isInternal); + const uniqueInternalLinks = [...new Set(internalLinks.map((link) => link.href))]; + + const checkMap = new Map< + string, + { status: number | null; isBroken: boolean } + >(); + + for (const href of uniqueInternalLinks.slice(0, 30)) { + try { + const response = await desktopContext.request.get(href, { + timeout: Math.max(1_000, timeoutMs - 1_000), + }); + const status = response.status(); + checkMap.set(href, { + status, + isBroken: status < 200 || status >= 400, + }); + } catch { + checkMap.set(href, { + status: null, + isBroken: true, + }); + } + } + + const desktopScreenshot = await captureHomepageScreenshot( + ctx, + desktopContext, + homepage.finalUrl, + "desktop", + timeoutMs, + ); + const mobileScreenshot = await captureHomepageScreenshot( + ctx, + mobileContext, + homepage.finalUrl, + "mobile", + timeoutMs, + ); + + const technicalInput = buildTechnicalChecks({ + 
rootUrl, + finalUrl: homepage.finalUrl, + title: homepage.title, + metaDescription: homepage.metaDescription, + visibleText: homepage.visibleText, + checkedUrls: crawledPages.map((page) => page.finalUrl), + links: allLinks.map((link) => { + const check = checkMap.get(link.href); + return { + href: link.href, + status: check?.status ?? undefined, + statusCode: check?.status ?? undefined, + isBroken: check?.isBroken, + }; + }), + }); + + const validCandidates = deduplicateLeadEmailCandidates( + crawledPages.flatMap((page) => page.emailCandidates), + ); + const persistedLinks = deduplicateCrawlLinks(allLinks).slice( + 0, + MAX_PERSISTED_LINKS, + ); + const persistedCandidates = validCandidates.slice( + 0, + MAX_PERSISTED_EMAIL_CANDIDATES, + ); + const usable = getUsableContactEmailFromEntries( + validCandidates.map((candidate) => ({ + email: candidate.email, + emailSource: candidate.emailSource, + contactPerson: candidate.contactPerson, + isBusinessContactAddress: candidate.isBusinessContactAddress, + })), + ); + + await ctx.runMutation(internal.websiteEnrichment.persistLeadEnrichmentResult, { + runId, + leadId: started.lead._id, + pages: crawledPages.map((page) => ({ + sourceUrl: page.sourceUrl, + finalUrl: page.finalUrl, + pageKind: page.pageKind, + title: page.title, + metaDescription: page.metaDescription, + headings: page.headings, + visibleTextExcerpt: trimExcerpt(page.visibleText), + hasContactFormSignal: page.hasContactFormSignal, + hasContactCtaSignal: page.hasContactCtaSignal, + })), + links: persistedLinks.map((link) => ({ + pageUrl: link.pageUrl, + href: link.href, + text: link.text, + isInternal: link.isInternal, + isBroken: checkMap.get(link.href)?.isBroken, + })), + emailCandidates: persistedCandidates.map((candidate) => ({ + email: candidate.email, + normalizedEmail: candidate.normalizedEmail, + emailSource: candidate.emailSource, + sourceUrl: candidate.sourceUrl, + contactPerson: candidate.contactPerson ?? 
undefined, + isBusinessContactAddress: candidate.isBusinessContactAddress, + isGeneric: candidate.isGeneric, + accepted: + usable !== null && candidate.normalizedEmail === usable.email, + })), + screenshots: [ + ...(desktopScreenshot ? [desktopScreenshot] : []), + ...(mobileScreenshot ? [mobileScreenshot] : []), + ], + technicalChecks: [ + { + sourceUrl: homepage.sourceUrl, + finalUrl: homepage.finalUrl, + usesHttps: technicalInput.https, + missingTitle: technicalInput.missingTitle, + missingMetaDescription: technicalInput.missingMetaDescription, + hasVisibleContactPath: technicalInput.hasVisibleContactPath, + brokenInternalLinkCount: technicalInput.brokenInternalLinks.length, + }, + ], + }); + + if (usable) { + await ctx.runMutation(internal.websiteEnrichment.patchLeadFromWebsiteEnrichment, { + leadId: started.lead._id, + email: usable.email, + emailSource: usable.emailSource ?? undefined, + contactPerson: usable.contactPerson ?? undefined, + currentContactStatus: started.lead.contactStatus, + }); + } else { + await ctx.runMutation(internal.websiteEnrichment.patchLeadFromWebsiteEnrichment, { + leadId: started.lead._id, + currentContactStatus: started.lead.contactStatus, + contactStatusReason: + "Kein verwertbarer Kontakt auf der Website gefunden.", + }); + } + + await ctx.runMutation(internal.websiteEnrichment.finishLeadEnrichmentRun, { + runId, + status: "succeeded", + currentStep: "website_enrichment", + errors: 0, + }); + + await ctx.runMutation(api.runs.appendEvent, { + runId, + level: "info", + message: usable + ? "Website-Enrichment erfolgreich mit nutzbarer E-Mail abgeschlossen." 
+ : "Website-Enrichment abgeschlossen, aber ohne nutzbare E-Mail.", + }); + + return runId; + } catch (error) { + const errorSummary = messageFromError(error); + + await ctx.runMutation(internal.websiteEnrichment.finishLeadEnrichmentRun, { + runId, + status: "failed", + currentStep: "website_enrichment", + errorSummary, + errors: 1, + }); + + await ctx.runMutation(api.runs.appendEvent, { + runId, + level: "error", + message: "Website-Enrichment fehlgeschlagen.", + details: [ + { label: "Fehler", value: errorSummary, source: "website_enrichment" }, + ], + }); + + if (started) { + await ctx.runMutation(internal.websiteEnrichment.patchLeadFromWebsiteEnrichment, { + leadId: started.lead._id, + currentContactStatus: started.lead.contactStatus, + contactStatusReason: `Website-Enrichment fehlgeschlagen: ${errorSummary}`, + }); + } + + return null; + } finally { + if (desktopContext) { + await desktopContext.close(); + } + if (mobileContext) { + await mobileContext.close(); + } + if (browser) { + await browser.close(); + } + } + }, +}); diff --git a/lib/lead-discovery-run.ts b/lib/lead-discovery-run.ts index 455563b..81d9d44 100644 --- a/lib/lead-discovery-run.ts +++ b/lib/lead-discovery-run.ts @@ -21,6 +21,21 @@ type LeadDiscoveryContactInput = { usableEmail?: string | null; }; +export type LeadDiscoveryContactStatus = + | "new" + | "missing_contact" + | "audit_ready" + | "outreach_ready" + | "contacted" + | "replied" + | "do_not_contact"; + +type WebsiteEnrichmentScheduleInput = { + websiteUrl?: string | null; + websiteDomain?: string | null; + contactStatus: LeadDiscoveryContactStatus; +}; + export type LeadDiscoveryPriority = "high" | "medium" | "low" | "defer" | "blocked"; type LeadDiscoveryPriorityInput = { @@ -39,7 +54,7 @@ type LeadDiscoveryLeadRecordInput 0 ? 
value : undefined; } @@ -91,6 +106,16 @@ export function getLeadDiscoveryContactStatus( return "missing_contact"; } +export function shouldScheduleWebsiteEnrichment( + input: WebsiteEnrichmentScheduleInput, +) { + const hasWebsiteData = + optionalString(input.websiteUrl) !== undefined || + optionalString(input.websiteDomain) !== undefined; + + return input.contactStatus === "missing_contact" && hasWebsiteData; +} + export function buildLeadDiscoveryLeadRecord< TCampaignId extends string, TRunId extends string, diff --git a/lib/website-crawler.ts b/lib/website-crawler.ts new file mode 100644 index 0000000..2a0e684 --- /dev/null +++ b/lib/website-crawler.ts @@ -0,0 +1,605 @@ +import { normalizeEmailAddress } from "./lead-discovery-google"; + +const HTTP_SCHEMES = new Set(["http:", "https:"]); + +const RELEVANT_PATH_PATTERNS = [ + /(?:^|\/)(kontakt|contact)(?:[-/]|$)/i, + /(?:^|\/)(impressum|imprint)(?:[-/]|$)/i, + /(?:^|\/)(leistungen|angebot|services?)(?:[-/]|$)/i, + /(?:^|\/)(ueber|über|team|about)(?:[-/]|$)/i, +]; + +const CONTACT_CONTEXT_KEYWORDS = [ + "ansprechpartner", + "kontakt", + "e-mail", + "email", + "team", + "impressum", + "geschäftsführung", + "imprint", + "footer", + "anfrage", +]; + +const GENERIC_BUSINESS_LOCALS = new Set([ + "info", + "kontakt", + "contact", + "office", + "hello", + "sales", + "support", + "service", + "team", + "post", +]); + +export type WebsiteCrawlEmailCandidate = { + email: string; + emailSource: string | null; + contactPerson: string | null; + isBusinessContactAddress: boolean; +}; + +export type WebsiteCrawlContactSignals = { + visibleText: string; + phoneNumbers: string[]; + emailCandidates: WebsiteCrawlEmailCandidate[]; + hasContactFormSignal: boolean; + hasContactCtaSignal: boolean; +}; + +export type TechnicalChecksInput = { + rootUrl?: string | null; + finalUrl?: string | null; + title?: string | null; + metaDescription?: string | null; + visibleText?: string | null; + checkedUrls?: string[]; + links?: Array< + | string 
+ | { + href?: string; + status?: number; + statusCode?: number; + isBroken?: boolean; + } + >; +}; + +export type WebsiteTechnicalChecks = { + https: boolean; + finalUrl: string; + missingTitle: boolean; + missingMetaDescription: boolean; + hasVisibleContactPath: boolean; + brokenInternalLinks: string[]; +}; + +function stripWww(host: string) { + return host.replace(/^www\./i, ""); +} + +function toLowerHost(value: string) { + try { + return new URL(value).hostname.toLowerCase(); + } catch { + return ""; + } +} + +export function normalizeCrawlUrl(input?: string | null, base?: string) { + if (!input) { + return null; + } + + const trimmed = input.trim(); + if (!trimmed) { + return null; + } + + if (!base && (trimmed.startsWith("//") || !trimmed.includes("://"))) { + return null; + } + + let parsed: URL; + try { + parsed = new URL(trimmed, base); + } catch { + return null; + } + + if (!HTTP_SCHEMES.has(parsed.protocol)) { + return null; + } + + const normalizedHost = stripWww(parsed.hostname.toLowerCase()); + + const search = parsed.search; + const path = parsed.pathname || "/"; + + return `${parsed.protocol}//${normalizedHost}${parsed.port ? `:${parsed.port}` : ""}${path}${search}`; +} + +export function isSameRegistrableHostishDomain( + candidateUrl: string, + rootUrl: string, +) { + const root = normalizeCrawlUrl(rootUrl) ?? undefined; + const candidate = normalizeCrawlUrl(candidateUrl, root); + + if (!candidate || !root) { + return false; + } + + const candidateHost = stripWww(toLowerHost(candidate)); + const rootHost = stripWww(toLowerHost(root)); + + return candidateHost === rootHost && candidateHost.length > 0; +} + +function normalizeForQueue(value: string | null) { + if (!value) { + return null; + } + + let url: URL; + try { + url = new URL(value); + } catch { + return null; + } + + const host = `${stripWww(url.hostname.toLowerCase())}${url.port ? 
`:${url.port}` : ""}`; + return `${url.protocol}//${host}${url.pathname.replace(/\/$/, "") || "/"}`; +} + +export function discoverRelevantSubpageUrls(links: string[], rootUrl: string) { + const root = normalizeCrawlUrl(rootUrl); + if (!root) { + return []; + } + + const parsedRoot = new URL(root); + const homepage = `${parsedRoot.protocol}//${stripWww( + parsedRoot.hostname.toLowerCase(), + )}${parsedRoot.port ? `:${parsedRoot.port}` : ""}/`; + + const seen = new Set([homepage]); + const buckets: string[][] = [[], [], [], []]; + + for (const link of links) { + const normalized = normalizeCrawlUrl(link, rootUrl); + if (!normalized || !isSameRegistrableHostishDomain(normalized, rootUrl)) { + continue; + } + + const canonical = normalizeForQueue(normalized); + if (!canonical || seen.has(canonical)) { + continue; + } + + let path: string; + try { + path = new URL(normalized).pathname.toLowerCase(); + } catch { + continue; + } + + for (const [priority, pattern] of RELEVANT_PATH_PATTERNS.entries()) { + if (pattern.test(path)) { + if (buckets[priority].length > 0) { + break; + } + buckets[priority].push(canonical); + seen.add(canonical); + break; + } + } + } + + const relevant = [...buckets.flat()]; + + return [homepage, ...relevant].slice(0, 5); +} + +function stripHtml(input: string) { + return input + .replace(//gi, " ") + .replace(//gi, " ") + .replace(/<[^>]*>/g, " ") + .replace(/\s+/g, " ") + .trim(); +} + +function stripLeadingToText(input: string) { + return input.replace(/<[^>]*>/g, "").replace(/\s+/g, " ").trim(); +} + +function decodeCommonEmailEntities(input: string) { + return input + .replace(/ | | /gi, " ") + .replace(/@|@|@/gi, "@") + .replace(/.|.|./gi, "."); +} + +function normalizeEmailExtractionInput(input: string) { + return decodeCommonEmailEntities(input) + .replace(//gi, " ") + .replace(//gi, " ") + .replace(/\s+/g, " ") + .trim(); +} + +function normalizeMailtoAddress(value: string) { + const strippedQuery = value.split("?")[0] ?? 
""; + const withoutMailto = strippedQuery.replace(/^mailto:/i, ""); + try { + return decodeURIComponent(withoutMailto).trim(); + } catch { + return withoutMailto.trim(); + } +} + +function denormalizeObfuscatedEmail(value: string) { + const withAt = value + .replace(/\[\s*at\s*\]|\(\s*at\s*\)|\{\s*at\s*\}/gi, "@") + .replace(/\bpunkt\b|\bdot\b/gi, ".") + .replace(/\[\s*dot\s*\]|\(\s*dot\s*\)|\{\s*dot\s*\}/gi, "."); + + return withAt + .replace(/\s*@\s*/g, "@") + .replace(/\s*\.\s*/g, ".") + .replace(/\s+/g, ""); +} + +function addEmailCandidate( + entries: WebsiteCrawlEmailCandidate[], + seen: Set, + email: string, + source: string, + index: number, + length: number, + explicitPersons: Map, +) { + const normalized = normalizeEmailAddress(email); + if (!normalized || seen.has(normalized)) { + return; + } + + const businessContext = hasBusinessContactContext(source, index, length); + const explicitPerson = + explicitPersons.get(normalized) ?? getContactPersonForEmail(source, email, index); + + entries.push({ + email: normalized, + emailSource: null, + contactPerson: explicitPerson, + isBusinessContactAddress: businessContext, + }); + seen.add(normalized); +} + +function collectObfuscatedEmailCandidates( + source: string, + explicitPersons: Map, +) { + const normalizedSource = normalizeEmailExtractionInput(source); + const localPart = "[a-z0-9._%+-]{1,64}"; + const domainLabel = "[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?"; + const tld = "[a-z]{2,}"; + const strictAtSeparator = + "(?:@|\\[\\s*at\\s*\\]|\\(\\s*at\\s*\\)|\\{\\s*at\\s*\\})"; + const looseAtSeparator = "\\bat\\b"; + const atSeparator = `(?:${strictAtSeparator}|${looseAtSeparator})`; + const strictDotSeparator = + "(?:\\.|\\[\\s*(?:dot|punkt)\\s*\\]|\\(\\s*(?:dot|punkt)\\s*\\)|\\{\\s*(?:dot|punkt)\\s*\\})"; + const looseDotSeparator = "\\b(?:dot|punkt)\\b"; + const dotSeparator = `(?:${strictDotSeparator}|${looseDotSeparator})`; + + const obfuscatedEmailRegex = new RegExp( + 
`\\b(?${localPart})\\s*(?${atSeparator})\\s*(?${domainLabel}(?:\\s*${dotSeparator}\\s*${domainLabel})*\\s*${dotSeparator}\\s*${tld})\\b`, + "gi", + ); + + const candidates: WebsiteCrawlEmailCandidate[] = []; + const seen = new Set(); + + for (const match of normalizedSource.matchAll(obfuscatedEmailRegex)) { + const rawCandidate = match[0]; + if (!rawCandidate) { + continue; + } + + const localPartMatch = match.groups?.local ?? ""; + const atSeparatorMatch = match.groups?.at ?? ""; + const domainPartMatch = match.groups?.domain ?? ""; + const isBareAt = + /\bat\b/i.test(atSeparatorMatch) && !/@|\[|\(|\{/.test(atSeparatorMatch); + const hasBareDot = /\b(?:dot|punkt)\b/i.test(domainPartMatch); + + const deobfuscationIndex = match.index ?? -1; + if (deobfuscationIndex < 0) { + continue; + } + + if ((isBareAt || hasBareDot) && !GENERIC_BUSINESS_LOCALS.has(localPartMatch.toLowerCase()) && + !hasBusinessContactContext( + normalizedSource, + deobfuscationIndex, + rawCandidate.length, + )) { + continue; + } + + const normalized = denormalizeObfuscatedEmail(rawCandidate); + const normalizedEmail = normalizeEmailAddress(normalized); + if (!normalizedEmail || seen.has(normalizedEmail)) { + continue; + } + + const explicitPerson = + explicitPersons.get(normalizedEmail) ?? 
+ getContactPersonForEmail(normalizedSource, rawCandidate, deobfuscationIndex); + const businessContext = hasBusinessContactContext( + normalizedSource, + deobfuscationIndex, + rawCandidate.length, + ); + candidates.push({ + email: normalizedEmail, + emailSource: null, + contactPerson: explicitPerson, + isBusinessContactAddress: businessContext, + }); + seen.add(normalizedEmail); + } + + return candidates; +} + +function getContactPersonForEmail( + text: string, + email: string, + index: number, +) { + const windowStart = Math.max(0, index - 120); + const windowEnd = Math.min(text.length, index + email.length + 120); + const context = text.slice(windowStart, windowEnd); + + const beforeEmailContext = context.slice(0, index - windowStart); + const anchorMatches = Array.from( + beforeEmailContext.matchAll(/]*>(.*?)<\/a>/gi), + ); + const nearestAnchor = anchorMatches.at(-1); + if (nearestAnchor?.[1]) { + const anchorText = stripLeadingToText(nearestAnchor[1]).trim(); + if (anchorText && !/@/.test(anchorText) && anchorText.length < 120) { + return anchorText; + } + } + + const nearMatch = context.match( + /(?:(?:^|[>\s])([A-ZÄÖÜ][a-zäöüßÄÖÜ]+\s+[A-ZÄÖÜ][a-zäöüßÄÖÜ-]+(?:\s+[A-ZÄÖÜ][a-zäöüßÄÖÜ-]+)?))$/u, + ); + if (nearMatch?.[1]) { + return stripLeadingToText(nearMatch[1]!).trim(); + } + + const directMatch = text.slice(0, index).match( + /([A-ZÄÖÜ][a-zäöüßÄÖÜ-]+\s+[A-ZÄÖÜ][a-zäöüßÄÖÜ-]+)\s*(?:,|\s+\()?\s*$/u, + ); + return directMatch?.[1]?.trim() ?? 
null; +} + +function hasBusinessContactContext(text: string, index: number, length: number) { + const context = text + .slice(Math.max(0, index - 140), Math.min(text.length, index + length + 140)) + .toLowerCase(); + + return CONTACT_CONTEXT_KEYWORDS.some((keyword) => context.includes(keyword)); +} + +function makePhoneNumberSet(input: string) { + const phoneRegex = /(?:\+?\d[\d\s./()-]{7,}\d)/g; + const matches = input.matchAll(phoneRegex); + const values = new Set(); + + for (const match of matches) { + const raw = match[0] ?? ""; + const normalized = raw.replace(/[^\d+]/g, ""); + if (normalized.length >= 7) { + values.add(raw.trim()); + values.add(normalized); + } + } + + return Array.from(values).filter((value) => value.length >= 7); +} + +function makeEmailCandidates(input: string) { + const emailRegex = /[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}(?:\b)?/gi; + const mailtoAnchors = input.matchAll( + /href=["']mailto:([^"'>\s]+)["'][^>]*>(.*?)<\/a>/gi, + ); + const normalizedInput = normalizeEmailExtractionInput(input); + const explicitPersons = new Map(); + + const entries: WebsiteCrawlEmailCandidate[] = []; + const seen = new Set(); + + for (const anchorMatch of mailtoAnchors) { + const rawHref = normalizeMailtoAddress(anchorMatch[1] ?? ""); + const email = normalizeEmailAddress(rawHref); + if (!email) { + continue; + } + const label = stripLeadingToText( + decodeCommonEmailEntities(anchorMatch[2] ?? ""), + ).trim(); + const normalizedLabelEmail = normalizeEmailAddress(label); + if (label && label.length <= 64 && !label.includes("@")) { + explicitPersons.set(email, label); + } + if (seen.has(email)) { + continue; + } + const anchorIndex = anchorMatch.index ?? -1; + if (anchorIndex < 0) { + continue; + } + const contactPerson = + normalizedLabelEmail && normalizedLabelEmail === email ? 
null : label || null; + entries.push({ + email, + emailSource: null, + contactPerson, + isBusinessContactAddress: hasBusinessContactContext( + input, + anchorIndex, + email.length, + ), + }); + seen.add(email); + } + + for (const match of normalizedInput.matchAll(emailRegex)) { + const rawEmail = match[0] ?? ""; + const idx = match.index ?? -1; + if (rawEmail.length === 0 || idx < 0) { + continue; + } + addEmailCandidate( + entries, + seen, + rawEmail, + normalizedInput, + idx, + rawEmail.length, + explicitPersons, + ); + } + + for (const candidate of collectObfuscatedEmailCandidates(input, explicitPersons)) { + if (seen.has(candidate.email)) { + continue; + } + entries.push(candidate); + seen.add(candidate.email); + } + + return entries; +} + +export function extractContactSignalsFromHtmlLikeText(input: string) { + const visibleText = stripHtml(input); + const phoneNumbers = makePhoneNumberSet(visibleText); + const emailCandidates = makeEmailCandidates(input); + + const lowerInput = input.toLowerCase(); + + const hasContactFormSignal = + /kontaktformular|anfrageformular|contact form|(); + const checkedUrls = input.checkedUrls ?? []; + for (const checkedUrl of checkedUrls) { + const normalizedCheckedUrl = normalizeCrawlUrl(checkedUrl, normalizedRoot ?? undefined); + if (!normalizedCheckedUrl || !isSameRegistrableHostishDomain(normalizedCheckedUrl, normalizedRoot)) { + continue; + } + const canonicalCheckedUrl = normalizeForQueue(normalizedCheckedUrl); + if (canonicalCheckedUrl) { + checkedUrlSet.add(canonicalCheckedUrl); + } + } + + const hasCheckedUrls = checkedUrlSet.size > 0; + + const brokenInternalLinksSet = new Set(); + + for (const entry of input.links ?? []) { + const href = typeof entry === "string" ? entry : (entry.href ?? ""); + const normalizedLink = normalizeCrawlUrl(href, normalizedRoot ?? 
undefined); + if (!normalizedLink || !isSameRegistrableHostishDomain(normalizedLink, normalizedRoot)) { + continue; + } + + const canonical = normalizeForQueue(normalizedLink); + if (!canonical) { + continue; + } + + if (hasCheckedUrls && !checkedUrlSet.has(canonical)) { + continue; + } + + let isBroken = false; + if (typeof entry !== "string") { + if (entry.isBroken === true) { + isBroken = true; + } + + const status = entry.status ?? entry.statusCode; + if (typeof status === "number" && (status >= 400 || status <= 0)) { + isBroken = true; + } + } + + if (isBroken) { + brokenInternalLinksSet.add(canonical); + } + } + + return { + https: finalUrl.startsWith("https://"), + finalUrl, + missingTitle: title.length === 0, + missingMetaDescription: metaDescription.length === 0, + hasVisibleContactPath, + brokenInternalLinks: Array.from(brokenInternalLinksSet), + }; +} diff --git a/package.json b/package.json index 1a76f35..afeeb13 100644 --- a/package.json +++ b/package.json @@ -13,12 +13,14 @@ "dependencies": { "@convex-dev/better-auth": "^0.12.2", "@hookform/resolvers": "^5.4.0", + "@sparticuz/chromium-min": "^149.0.0", "better-auth": "^1.6.14", "class-variance-authority": "^0.7.1", "clsx": "^2.1.1", "convex": "^1.40.0", "lucide-react": "^1.17.0", "next": "16.2.7", + "playwright-core": "^1.60.0", "radix-ui": "^1.4.3", "react": "19.2.4", "react-dom": "19.2.4", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 9cb119a..3a0ab4e 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -14,6 +14,9 @@ importers: '@hookform/resolvers': specifier: ^5.4.0 version: 5.4.0(react-hook-form@7.77.0(react@19.2.4)) + '@sparticuz/chromium-min': + specifier: ^149.0.0 + version: 149.0.0 better-auth: specifier: ^1.6.14 version: 1.6.14(next@16.2.7(@babel/core@7.29.7)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react-dom@19.2.4(react@19.2.4))(react@19.2.4) @@ -32,6 +35,9 @@ importers: next: specifier: 16.2.7 version: 16.2.7(@babel/core@7.29.7)(react-dom@19.2.4(react@19.2.4))(react@19.2.4) + 
playwright-core: + specifier: ^1.60.0 + version: 1.60.0 radix-ui: specifier: ^1.4.3 version: 1.4.3(@types/react-dom@19.2.3(@types/react@19.2.16))(@types/react@19.2.16)(react-dom@19.2.4(react@19.2.4))(react@19.2.4) @@ -1596,6 +1602,10 @@ packages: resolution: {integrity: sha512-tlqY9xq5ukxTUZBmoOp+m61cqwQD5pHJtFY3Mn8CA8ps6yghLH/Hw8UPdqg4OLmFW3IFlcXnQNmo/dh8HzXYIQ==} engines: {node: '>=18'} + '@sparticuz/chromium-min@149.0.0': + resolution: {integrity: sha512-/+QWJ6jDQnm/U7BITWVVcoe1CbuyW13pjonFpfBY67ZxePbaY/j4Ho+//n82AoGwugdkVVOYGY00KzMJzfYQdg==} + engines: {node: ^22.17.0 || >=24.0.0} + '@standard-schema/spec@1.1.0': resolution: {integrity: sha512-l2aFy5jALhniG5HgqrD6jXLi/rUWrKvqN/qJx6yoJsgKhblVd+iqqU4RCXavm/jPityDo5TCvKMnpjKnOriy0w==} @@ -2025,6 +2035,14 @@ packages: resolution: {integrity: sha512-qIj0G9wZbMGNLjLmg1PT6v2mE9AH2zlnADJD/2tC6E00hgmhUOfEB6greHPAfLRSufHqROIUTkw6E+M3lH0PTQ==} engines: {node: '>= 0.4'} + b4a@1.8.1: + resolution: {integrity: sha512-aiqre1Nr0B/6DgE2N5vwTc+2/oQZ4Wh1t4NznYY4E00y8LCt6NqdRv81so00oo27D8MVKTpUa/MwUUtBLXCoDw==} + peerDependencies: + react-native-b4a: '*' + peerDependenciesMeta: + react-native-b4a: + optional: true + balanced-match@1.0.2: resolution: {integrity: sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==} @@ -2032,6 +2050,47 @@ packages: resolution: {integrity: sha512-BLrgEcRTwX2o6gGxGOCNyMvGSp35YofuYzw9h1IMTRmKqttAZZVU67bdb9Pr2vUHA8+j3i2tJfjO6C6+4myGTA==} engines: {node: 18 || 20 || >=22} + bare-events@2.9.1: + resolution: {integrity: sha512-Z0oHEHAFDZkffN8Qc39zNZjQlMDkPJRyyyZieU1VH7u8c5S+qHZ2S8ixdKIAxEjfHO7FJxXmJWgteOghVanIsg==} + peerDependencies: + bare-abort-controller: '*' + peerDependenciesMeta: + bare-abort-controller: + optional: true + + bare-fs@4.7.2: + resolution: {integrity: sha512-aTvMFUWkBmjzKtEQMDGGDNF8bkfpD5N1b/FCwt7A3wrU4t1o/e/85Wzkluh6JlODCjqVESYCkQCdTXqZ9G7VFg==} + engines: {bare: '>=1.16.0'} + peerDependencies: + bare-buffer: '*' + peerDependenciesMeta: + 
bare-buffer: + optional: true + + bare-os@3.9.1: + resolution: {integrity: sha512-6M5XjcnsygQNPMCMPXSK379xrJFiZ/AEMNBmFEmQW8d/789VQATvriyi5r0HYTL9TkQ26rn3kgdTG3aisbrXkQ==} + engines: {bare: '>=1.14.0'} + + bare-path@3.0.1: + resolution: {integrity: sha512-ghj2DSK/2e99a1anTVPCV4m4YIYtrbXhfM7V3D7XZLOTsybnYyaJloymGqssQc8l/or0UoDyRtNQkmkEF/ysgQ==} + + bare-stream@2.13.1: + resolution: {integrity: sha512-Vp0cnjYyrEC4whYTymQ+YZi6pBpfiICZO3cfRG8sy67ZNWe951urv1x4eW1BKNngw3U+3fPYb5JQvHbCtxH7Ow==} + peerDependencies: + bare-abort-controller: '*' + bare-buffer: '*' + bare-events: '*' + peerDependenciesMeta: + bare-abort-controller: + optional: true + bare-buffer: + optional: true + bare-events: + optional: true + + bare-url@2.4.3: + resolution: {integrity: sha512-Kccpc7ACfXaxfeInfqKcZtW4pT5YBn1mesc4sCsun6sRwtbJ4h+sNOaksUpYEJUKfN65YWC6Bw2OJEFiKxq8nQ==} + baseline-browser-mapping@2.10.33: resolution: {integrity: sha512-bA6+tcSLpz2tIEdDXZPpPTIuxBcC4+w6SieaYyfigIa4h8GlFxbA17v22Vx3JUtuZQj9SgOsnbK+aTBzyDyEuw==} engines: {node: '>=6.0.0'} @@ -2430,6 +2489,9 @@ packages: resolution: {integrity: sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==} engines: {node: '>= 0.8'} + end-of-stream@1.4.5: + resolution: {integrity: sha512-ooEGc6HP26xXq/N+GCGOT0JKCLDGrq2bQUZrQ7gyrJiZANJ/8YDTxTpQBXGMn+WbIQXNVpyWymm7KYVICQnyOg==} + enhanced-resolve@5.22.1: resolution: {integrity: sha512-6QEuw3zoX1SJQc7b87aBXke/no+mG2bTBgw29gWMQonLmpEkWoCAVkl+M49e48AZlWzxiDzDZzYdp6kobcyLww==} engines: {node: '>=10.13.0'} @@ -2622,6 +2684,9 @@ packages: resolution: {integrity: sha512-aIL5Fx7mawVa300al2BnEE4iNvo1qETxLrPI/o05L7z6go7fCw1J6EQmbK4FmJ2AS7kgVF/KEZWufBfdClMcPg==} engines: {node: '>= 0.6'} + events-universal@1.0.1: + resolution: {integrity: sha512-LUd5euvbMLpwOF8m6ivPCbhQeSiYVNb8Vs0fQ8QjXo0JTkEHpz8pxdQf0gStltaPpw0Cca8b39KxvK9cfKRiAw==} + eventsource-parser@3.1.0: resolution: {integrity: 
sha512-kJezFj9YFAMLeORyi7aCLxLbD5/qWMQnoMVlVPyHIll7lgRJCc3JVln9Vgl9nwQi0YkMnhdGTMNn7CkRRAptMg==} engines: {node: '>=18.0.0'} @@ -2651,6 +2716,9 @@ packages: fast-deep-equal@3.1.3: resolution: {integrity: sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==} + fast-fifo@1.3.2: + resolution: {integrity: sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==} + fast-glob@3.3.1: resolution: {integrity: sha512-kNFPyjhh5cKjrUltxs+wFx+ZkbRaxxmZ+X0ZU31SOsxCEtP9VPgtq2teZw1DebupL5GmDaNQ6yKMMVcM41iqDg==} engines: {node: '>=8.6.0'} @@ -3550,6 +3618,11 @@ packages: resolution: {integrity: sha512-wQ0b/W4Fr01qtpHlqSqspcj3EhBvimsdh0KlHhH8HRZnMsEa0ea2fTULOXOS9ccQr3om+GcGRk4e+isrZWV8qQ==} engines: {node: '>=16.20.0'} + playwright-core@1.60.0: + resolution: {integrity: sha512-9bW6zvX/m0lEbgTKJ6YppOKx8H3VOPBMOCFh2irXFOT4BbHgrx5hPjwJYLT40Lu+4qtD36qKc/Hn56StUW57IA==} + engines: {node: '>=18'} + hasBin: true + possible-typed-array-names@1.1.0: resolution: {integrity: sha512-/+5VFTchJDoVj3bhoqi6UeymcD00DAwb1nJwamzPvHEszJ4FpF6SNNbUbOS8yI56qHzdV8eK0qEfOSiodkTdxg==} engines: {node: '>= 0.4'} @@ -3594,6 +3667,9 @@ packages: resolution: {integrity: sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg==} engines: {node: '>= 0.10'} + pump@3.0.4: + resolution: {integrity: sha512-VS7sjc6KR7e1ukRFhQSY5LM2uBWAUPiOPa/A3mkKmiMwSmRFUITt0xuj+/lesgnCv+dPIEYlkzrcyXgquIHMcA==} + punycode@2.3.1: resolution: {integrity: sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==} engines: {node: '>=6'} @@ -3853,6 +3929,9 @@ packages: resolution: {integrity: sha512-eLoXW/DHyl62zxY4SCaIgnRhuMr6ri4juEYARS8E6sCEqzKpOiE521Ucofdx+KnDZl5xmvGYaaKCk5FEOxJCoQ==} engines: {node: '>= 0.4'} + streamx@2.26.0: + resolution: {integrity: sha512-VvNG1K72Po/xwJzxZFnZ++Tbrv4lwSptsbkFuzXCJAYZvCK5nnxsvXU6ajqkv7chyiI1Y0YXq2Jh8Iy8Y7NF/A==} + strict-event-emitter@0.5.1: 
resolution: {integrity: sha512-vMgjE/GGEPEFnhFub6pa4FmJBRBVOLpIII2hvCZ8Kzb7K0hlHo7mQv6xYrBvCL2LtAIBwFUK8wvuJgTVSQ5MFQ==} @@ -3950,6 +4029,18 @@ packages: resolution: {integrity: sha512-uxc/zpqFg6x7C8vOE7lh6Lbda8eEL9zmVm/PLeTPBRhh1xCgdWaQ+J1CUieGpIfm2HdtsUpRv+HshiasBMcc6A==} engines: {node: '>=6'} + tar-fs@3.1.2: + resolution: {integrity: sha512-QGxxTxxyleAdyM3kpFs14ymbYmNFrfY+pHj7Z8FgtbZ7w2//VAgLMac7sT6nRpIHjppXO2AwwEOg0bPFVRcmXw==} + + tar-stream@3.2.0: + resolution: {integrity: sha512-ojzvCvVaNp6aOTFmG7jaRD0meowIAuPc3cMMhSgKiVWws1GyHbGd/xvnyuRKcKlMpt3qvxx6r0hreCNITP9hIg==} + + teex@1.0.1: + resolution: {integrity: sha512-eYE6iEI62Ni1H8oIa7KlDU6uQBtqr4Eajni3wX7rpfXD8ysFx8z0+dri+KWEPWpBsxXfxu58x/0jvTVT1ekOSg==} + + text-decoder@1.2.7: + resolution: {integrity: sha512-vlLytXkeP4xvEq2otHeJfSQIRyWxo/oZGEbXrtEEF9Hnmrdly59sUbzZ/QgyWuLYHctCHxFF4tRQZNQ9k60ExQ==} + tiny-invariant@1.3.3: resolution: {integrity: sha512-+FbBPE1o9QAYvviau/qC5SE3caw21q3xkvWKBtja5vgqOWIHHJ3ioaq1VPfn/Szqctz2bU/oYeKd9/z5BL+PVg==} @@ -5671,6 +5762,14 @@ snapshots: '@sindresorhus/merge-streams@4.0.0': {} + '@sparticuz/chromium-min@149.0.0': + dependencies: + tar-fs: 3.1.2 + transitivePeerDependencies: + - bare-abort-controller + - bare-buffer + - react-native-b4a + '@standard-schema/spec@1.1.0': {} '@standard-schema/utils@0.3.0': {} @@ -6078,10 +6177,44 @@ snapshots: axobject-query@4.1.0: {} + b4a@1.8.1: {} + balanced-match@1.0.2: {} balanced-match@4.0.4: {} + bare-events@2.9.1: {} + + bare-fs@4.7.2: + dependencies: + bare-events: 2.9.1 + bare-path: 3.0.1 + bare-stream: 2.13.1(bare-events@2.9.1) + bare-url: 2.4.3 + fast-fifo: 1.3.2 + transitivePeerDependencies: + - bare-abort-controller + - react-native-b4a + + bare-os@3.9.1: {} + + bare-path@3.0.1: + dependencies: + bare-os: 3.9.1 + + bare-stream@2.13.1(bare-events@2.9.1): + dependencies: + streamx: 2.26.0 + teex: 1.0.1 + optionalDependencies: + bare-events: 2.9.1 + transitivePeerDependencies: + - react-native-b4a + + bare-url@2.4.3: + 
dependencies: + bare-path: 3.0.1 + baseline-browser-mapping@2.10.33: {} better-auth@1.6.14(next@16.2.7(@babel/core@7.29.7)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react-dom@19.2.4(react@19.2.4))(react@19.2.4): @@ -6384,6 +6517,10 @@ snapshots: encodeurl@2.0.0: {} + end-of-stream@1.4.5: + dependencies: + once: 1.4.0 + enhanced-resolve@5.22.1: dependencies: graceful-fs: 4.2.11 @@ -6745,6 +6882,12 @@ snapshots: etag@1.8.1: {} + events-universal@1.0.1: + dependencies: + bare-events: 2.9.1 + transitivePeerDependencies: + - bare-abort-controller + eventsource-parser@3.1.0: {} eventsource@3.0.7: @@ -6818,6 +6961,8 @@ snapshots: fast-deep-equal@3.1.3: {} + fast-fifo@1.3.2: {} + fast-glob@3.3.1: dependencies: '@nodelib/fs.stat': 2.0.5 @@ -7654,6 +7799,8 @@ snapshots: pkce-challenge@5.0.1: {} + playwright-core@1.60.0: {} + possible-typed-array-names@1.1.0: {} postcss-selector-parser@7.1.1: @@ -7699,6 +7846,11 @@ snapshots: forwarded: 0.2.0 ipaddr.js: 1.9.1 + pump@3.0.4: + dependencies: + end-of-stream: 1.4.5 + once: 1.4.0 + punycode@2.3.1: {} qs@6.15.2: @@ -8101,6 +8253,15 @@ snapshots: es-errors: 1.3.0 internal-slot: 1.1.0 + streamx@2.26.0: + dependencies: + events-universal: 1.0.1 + fast-fifo: 1.3.2 + text-decoder: 1.2.7 + transitivePeerDependencies: + - bare-abort-controller + - react-native-b4a + strict-event-emitter@0.5.1: {} string-width@4.2.3: @@ -8208,6 +8369,42 @@ snapshots: tapable@2.3.3: {} + tar-fs@3.1.2: + dependencies: + pump: 3.0.4 + tar-stream: 3.2.0 + optionalDependencies: + bare-fs: 4.7.2 + bare-path: 3.0.1 + transitivePeerDependencies: + - bare-abort-controller + - bare-buffer + - react-native-b4a + + tar-stream@3.2.0: + dependencies: + b4a: 1.8.1 + bare-fs: 4.7.2 + fast-fifo: 1.3.2 + streamx: 2.26.0 + transitivePeerDependencies: + - bare-abort-controller + - bare-buffer + - react-native-b4a + + teex@1.0.1: + dependencies: + streamx: 2.26.0 + transitivePeerDependencies: + - bare-abort-controller + - react-native-b4a + + text-decoder@1.2.7: + 
dependencies: + b4a: 1.8.1 + transitivePeerDependencies: + - react-native-b4a + tiny-invariant@1.3.3: {} tinyglobby@0.2.17: diff --git a/tests/campaigns-board-layout.test.ts b/tests/campaigns-board-layout.test.ts new file mode 100644 index 0000000..c31a7f8 --- /dev/null +++ b/tests/campaigns-board-layout.test.ts @@ -0,0 +1,30 @@ +import assert from "node:assert/strict"; +import { readFile } from "node:fs/promises"; +import { join } from "node:path"; +import test from "node:test"; + +const campaignsBoardPath = join( + process.cwd(), + "components", + "campaigns", + "campaigns-board.tsx", +); + +test("campaign board renders campaigns as responsive cards", async () => { + const source = await readFile(campaignsBoardPath, "utf8"); + + assert.doesNotMatch(source, / { @@ -180,6 +181,69 @@ test("lead discovery lead record stores valid email and sets contactStatus to ne assert.equal(record.contactPerson, undefined); }); +test("scheduling helper triggers website enrichment for missing contact leads with website data", () => { + assert.equal( + shouldScheduleWebsiteEnrichment({ + websiteUrl: "https://www.example.de", + websiteDomain: "example.de", + contactStatus: "missing_contact", + }), + true, + ); +}); + +test("scheduling helper does not trigger without website data", () => { + assert.equal( + shouldScheduleWebsiteEnrichment({ + websiteUrl: null, + websiteDomain: "", + contactStatus: "missing_contact", + }), + false, + ); +}); + +test("scheduling helper does not trigger when contact status is already usable", () => { + assert.equal( + shouldScheduleWebsiteEnrichment({ + websiteUrl: "https://www.example.de", + websiteDomain: "example.de", + contactStatus: "new", + }), + false, + ); +}); + +test("scheduling helper does not trigger for audit-ready leads", () => { + assert.equal( + shouldScheduleWebsiteEnrichment({ + websiteUrl: "https://www.example.de", + websiteDomain: "example.de", + contactStatus: "audit_ready", + }), + false, + ); +}); + +test("scheduling helper 
preserves existing contact-status behavior beyond TASK-7", () => { + assert.equal( + shouldScheduleWebsiteEnrichment({ + websiteUrl: "https://www.example.de", + websiteDomain: "example.de", + contactStatus: "outreach_ready", + }), + false, + ); + assert.equal( + shouldScheduleWebsiteEnrichment({ + websiteUrl: "https://www.example.de", + websiteDomain: "example.de", + contactStatus: "do_not_contact", + }), + false, + ); +}); + test("lead discovery lead record stores normalized matching fields", () => { const record = buildLeadDiscoveryLeadRecord({ campaignId: "campaign-1", diff --git a/tests/lead-discovery-source.test.ts b/tests/lead-discovery-source.test.ts new file mode 100644 index 0000000..a692f70 --- /dev/null +++ b/tests/lead-discovery-source.test.ts @@ -0,0 +1,84 @@ +import assert from "node:assert/strict"; +import { readFileSync } from "node:fs"; +import path from "node:path"; +import test from "node:test"; + +const leadDiscoveryPath = path.join(process.cwd(), "convex", "leadDiscovery.ts"); +const leadDiscoverySource = readFileSync(leadDiscoveryPath, "utf8"); + +function hasPattern(source: string, pattern: RegExp) { + return pattern.test(source); +} + +function extractExportSource(name: string) { + const marker = `export const ${name} = `; + const declarationIndex = leadDiscoverySource.indexOf(marker); + + assert.notEqual(declarationIndex, -1, `Expected declaration for ${name}`); + + const openBraceIndex = leadDiscoverySource.indexOf("{", declarationIndex); + let depth = 0; + let end = -1; + + for (let index = openBraceIndex; index < leadDiscoverySource.length; index++) { + const char = leadDiscoverySource[index]; + + if (char === "{") { + depth += 1; + } else if (char === "}") { + depth -= 1; + if (depth === 0) { + end = index; + break; + } + } + } + + assert.notEqual(end, -1, `Expected balanced braces for ${name}`); + return leadDiscoverySource.slice(openBraceIndex, end + 1); +} + +test("startCampaignRun checks active campaign runs via by_type_and_status", 
() => { + const source = extractExportSource("startCampaignRun"); + + assert.equal( + hasPattern( + source, + /withIndex\(\s*"by_type_and_status"\s*,\s*\(q\)\s*=>[\s\S]*?q\.eq\("type",\s*"campaign"\)\.eq\("status",\s*"running"\),?[\s\S]*?\)/, + ), + true, + "Campaign starts should only consider running campaign-type runs as blockers", + ); +}); + +test("persistDiscoveredLeads does not schedule website enrichment jobs directly", () => { + const source = extractExportSource("persistDiscoveredLeads"); + + assert.equal( + source.includes("ctx.scheduler.runAfter"), + false, + "Lead persistence must not call runAfter", + ); +}); + +test("processCampaignRun schedules website enrichment after lead persistence", () => { + const source = extractExportSource("processCampaignRun"); + + const persistIndex = source.indexOf( + "internal.leadDiscovery.persistDiscoveredLeads", + ); + const queueCall = source.indexOf("internal.websiteEnrichment.queueLeadEnrichment"); + const eventMessageIndex = source.indexOf("Website-Kontaktanreicherung geplant."); + + assert.notEqual(persistIndex, -1, "processCampaignRun should persist discovered leads"); + assert.notEqual(queueCall, -1, "processCampaignRun should schedule website enrichment"); + assert.notEqual(eventMessageIndex, -1, "processCampaignRun should append enrichment schedule events"); + assert.ok( + persistIndex < queueCall, + "processCampaignRun should schedule enrichment after persistence succeeds", + ); + assert.ok( + queueCall < eventMessageIndex, + "processCampaignRun should append enrichment event after scheduling", + ); +}); diff --git a/tests/leads-review-table.test.ts b/tests/leads-review-table.test.ts new file mode 100644 index 0000000..b98737f --- /dev/null +++ b/tests/leads-review-table.test.ts @@ -0,0 +1,112 @@ +import assert from "node:assert/strict"; +import { readFile } from "node:fs/promises"; +import { join } from "node:path"; +import test from "node:test"; + +const leadsReviewPath = join( + process.cwd(), + 
"components", + "leads", + "leads-review-table.tsx", +); + +test("LeadsReviewTable uses compact card summaries with expandable review details", async () => { + const source = await readFile(leadsReviewPath, "utf8"); + + assert.doesNotMatch(source, /