feat: add website enrichment crawler

This commit is contained in:
2026-06-04 20:29:23 +02:00
parent ca42c8d5a6
commit 1f6e31c01c
25 changed files with 3539 additions and 56 deletions

View File

@@ -1,6 +1,11 @@
import { defineSchema, defineTable } from "convex/server";
import { v } from "convex/values";
import { tables as authTables } from "./betterAuth/schema";
import {
RUN_EVENT_LEVELS,
RUN_STATUSES,
RUN_TYPES,
} from "./domain";
/** Validator for a campaign's status: either "active" or "paused". */
const campaignStatus = v.union(
  v.literal("active"),
  v.literal("paused"),
);
const leadPriority = v.union(
@@ -75,24 +80,19 @@ const blacklistType = v.union(
v.literal("company"),
v.literal("google_place_id"),
);
const runType = v.union(
v.literal("campaign"),
v.literal("lead_discovery"),
v.literal("audit"),
v.literal("outreach"),
v.literal("lifecycle"),
);
const runStatus = v.union(
v.literal("pending"),
v.literal("running"),
v.literal("succeeded"),
v.literal("failed"),
v.literal("canceled"),
const websiteEnrichmentPageKind = v.union(
v.literal("homepage"),
v.literal("contact"),
v.literal("impressum"),
v.literal("services"),
v.literal("about"),
v.literal("team"),
v.literal("other"),
);
/** Validator accepting any one of the literals listed in `RUN_TYPES`. */
const runType = v.union(
  ...RUN_TYPES.map((runTypeName) => v.literal(runTypeName)),
);
/** Validator accepting any one of the literals listed in `RUN_STATUSES`. */
const runStatus = v.union(
  ...RUN_STATUSES.map((runStatusName) => v.literal(runStatusName)),
);
const runEventLevel = v.union(
v.literal("info"),
v.literal("warning"),
v.literal("error"),
...RUN_EVENT_LEVELS.map((level) => v.literal(level)),
);
/** Validator for a screenshot viewport: either "desktop" or "mobile". */
const screenshotViewport = v.union(
  v.literal("desktop"),
  v.literal("mobile"),
);
/** Validator for a settings value: a string, a number, a boolean, or null. */
const settingsValue = v.union(
  v.string(),
  v.number(),
  v.boolean(),
  v.null(),
);
@@ -255,6 +255,85 @@ export default defineSchema({
.index("by_auditId_and_viewport", ["auditId", "viewport"])
.index("by_storageId", ["storageId"]),
// Table `websiteCrawlPages`: page-level data attached to a lead and,
// optionally, to an agent run. Presumably one document per page visited by the
// website enrichment crawler — confirm against the code that writes it.
websiteCrawlPages: defineTable({
// Required reference to the owning `leads` document.
leadId: v.id("leads"),
// Optional reference to an `agentRuns` document.
runId: v.optional(v.id("agentRuns")),
// NOTE(review): presumably the requested URL vs. the URL after redirects —
// confirm with the crawler implementation.
sourceUrl: v.string(),
finalUrl: v.string(),
// One of the `websiteEnrichmentPageKind` literals.
pageKind: websiteEnrichmentPageKind,
// Extracted page metadata; title, meta description and text excerpt may be
// absent, while `headings` is always an array of strings.
title: v.optional(v.string()),
metaDescription: v.optional(v.string()),
headings: v.array(v.string()),
visibleTextExcerpt: v.optional(v.string()),
// Required boolean contact signals.
hasContactFormSignal: v.boolean(),
hasContactCtaSignal: v.boolean(),
// Numeric timestamp; unit presumably milliseconds since epoch — TODO confirm.
createdAt: v.number(),
})
// NOTE(review): `by_leadId` is a prefix of `by_leadId_and_createdAt`; Convex
// guidance treats such prefix indexes as usually redundant. Confirm whether
// both are needed before removing either, since queries reference index names.
.index("by_leadId", ["leadId"])
.index("by_runId", ["runId"])
.index("by_leadId_and_createdAt", ["leadId", "createdAt"]),
// Table `websiteCrawlLinks`: link-level data attached to a lead and,
// optionally, to an agent run. Presumably one document per link found on a
// crawled page — confirm against the code that writes it.
websiteCrawlLinks: defineTable({
// Required reference to the owning `leads` document.
leadId: v.id("leads"),
// Optional reference to an `agentRuns` document.
runId: v.optional(v.id("agentRuns")),
// NOTE(review): presumably `pageUrl` is the page containing the link and
// `href` is the link target — confirm with the crawler implementation.
pageUrl: v.string(),
href: v.string(),
// Optional link text.
text: v.optional(v.string()),
isInternal: v.boolean(),
// Optional; presumably left unset when the link was not checked — TODO confirm.
isBroken: v.optional(v.boolean()),
// Numeric timestamp; unit presumably milliseconds since epoch — TODO confirm.
createdAt: v.number(),
})
// Lookup indexes by lead and by run.
.index("by_leadId", ["leadId"])
.index("by_runId", ["runId"]),
// Table `websiteEmailCandidates`: email addresses attached to a lead and,
// optionally, to an agent run, together with classification flags.
websiteEmailCandidates: defineTable({
// Required reference to the owning `leads` document.
leadId: v.id("leads"),
// Optional reference to an `agentRuns` document.
runId: v.optional(v.id("agentRuns")),
// Both the address and a normalized form are stored; the normalized form is
// the indexed one (see `by_normalizedEmail` below). The normalization rule is
// not visible here.
email: v.string(),
normalizedEmail: v.string(),
// Free-form string (not a literal union); allowed values are not visible here.
emailSource: v.string(),
sourceUrl: v.string(),
// Optional contact person name.
contactPerson: v.optional(v.string()),
// Required classification flags; the criteria behind them are defined by the
// writer, not by this schema.
isBusinessContactAddress: v.boolean(),
isGeneric: v.boolean(),
accepted: v.boolean(),
// Numeric timestamp; unit presumably milliseconds since epoch — TODO confirm.
createdAt: v.number(),
})
// Lookup indexes by lead, by normalized address, and by run.
.index("by_leadId", ["leadId"])
.index("by_normalizedEmail", ["normalizedEmail"])
.index("by_runId", ["runId"]),
// Table `websiteCrawlScreenshots`: screenshot metadata attached to a lead and,
// optionally, to an agent run. The image itself is referenced via `storageId`.
websiteCrawlScreenshots: defineTable({
// Required reference to the owning `leads` document.
leadId: v.id("leads"),
// Optional reference to an `agentRuns` document.
runId: v.optional(v.id("agentRuns")),
// Reference into the `_storage` table (Convex file storage).
storageId: v.id("_storage"),
// One of the `screenshotViewport` literals.
viewport: screenshotViewport,
sourceUrl: v.string(),
// Numeric timestamp; unit presumably milliseconds since epoch — TODO confirm.
capturedAt: v.number(),
// Image dimensions; presumably pixels — TODO confirm.
width: v.number(),
height: v.number(),
mimeType: v.string(),
// Stored separately from `capturedAt`; how the two differ is decided by the
// writer and is not visible here.
createdAt: v.number(),
})
// Lookup indexes by lead, by run, and by stored file.
.index("by_leadId", ["leadId"])
.index("by_runId", ["runId"])
.index("by_storageId", ["storageId"]),
// Table `websiteTechnicalChecks`: technical check results attached to a lead
// and, optionally, to an agent run.
websiteTechnicalChecks: defineTable({
// Required reference to the owning `leads` document.
leadId: v.id("leads"),
// Optional reference to an `agentRuns` document.
runId: v.optional(v.id("agentRuns")),
sourceUrl: v.string(),
// Optional here; presumably absent when no final URL was reached — TODO confirm.
finalUrl: v.optional(v.string()),
// Required boolean findings. Note that the `missing*` flags are true when the
// element is absent.
usesHttps: v.boolean(),
missingTitle: v.boolean(),
missingMetaDescription: v.boolean(),
hasVisibleContactPath: v.boolean(),
// Numeric count of broken internal links.
brokenInternalLinkCount: v.number(),
// Numeric timestamp; unit presumably milliseconds since epoch — TODO confirm.
createdAt: v.number(),
})
// Lookup indexes by lead and by run.
.index("by_leadId", ["leadId"])
.index("by_runId", ["runId"]),
outreachRecords: defineTable({
leadId: v.id("leads"),
auditId: v.optional(v.id("audits")),
@@ -309,7 +388,9 @@ export default defineSchema({
updatedAt: v.number(),
})
.index("by_status", ["status"])
.index("by_type", ["type"])
.index("by_type_and_status", ["type", "status"])
.index("by_type_and_status_and_leadId", ["type", "status", "leadId"])
.index("by_campaignId_and_updatedAt", ["campaignId", "updatedAt"])
.index("by_campaignId_and_status", ["campaignId", "status"])
.index("by_auditId", ["auditId"]),