feat: add website enrichment crawler
This commit is contained in:
4
convex/_generated/api.d.ts
vendored
4
convex/_generated/api.d.ts
vendored
@@ -19,6 +19,8 @@ import type * as outreach from "../outreach.js";
|
||||
import type * as runs from "../runs.js";
|
||||
import type * as settings from "../settings.js";
|
||||
import type * as storage from "../storage.js";
|
||||
import type * as websiteEnrichment from "../websiteEnrichment.js";
|
||||
import type * as websiteEnrichmentAction from "../websiteEnrichmentAction.js";
|
||||
|
||||
import type {
|
||||
ApiFromModules,
|
||||
@@ -38,6 +40,8 @@ declare const fullApi: ApiFromModules<{
|
||||
runs: typeof runs;
|
||||
settings: typeof settings;
|
||||
storage: typeof storage;
|
||||
websiteEnrichment: typeof websiteEnrichment;
|
||||
websiteEnrichmentAction: typeof websiteEnrichmentAction;
|
||||
}>;
|
||||
|
||||
/**
|
||||
|
||||
@@ -84,6 +84,7 @@ export const RUN_TYPES = [
|
||||
"audit",
|
||||
"outreach",
|
||||
"lifecycle",
|
||||
"website_enrichment",
|
||||
] as const;
|
||||
export const RUN_STATUSES = [
|
||||
"pending",
|
||||
|
||||
@@ -17,6 +17,7 @@ import {
|
||||
buildLeadDiscoveryLeadRecord,
|
||||
buildLeadDiscoveryCounters,
|
||||
getLeadDiscoveryPriority,
|
||||
shouldScheduleWebsiteEnrichment,
|
||||
} from "../lib/lead-discovery-run";
|
||||
import { calculateNextRunAt } from "../lib/campaign-scheduling";
|
||||
|
||||
@@ -214,6 +215,11 @@ export const processCampaignRun = internalAction({
|
||||
skippedDuplicates: number;
|
||||
skippedBlacklisted: number;
|
||||
errors: number;
|
||||
websiteEnrichmentQueue: Array<{
|
||||
leadId: Id<"leads">;
|
||||
companyName: string;
|
||||
website: string;
|
||||
}>;
|
||||
} = await ctx.runMutation(internal.leadDiscovery.persistDiscoveredLeads, {
|
||||
runId: args.runId,
|
||||
campaignId: campaign._id,
|
||||
@@ -223,6 +229,31 @@ export const processCampaignRun = internalAction({
|
||||
candidates,
|
||||
});
|
||||
|
||||
for (const enrichment of result.websiteEnrichmentQueue) {
|
||||
await ctx.runMutation(internal.websiteEnrichment.queueLeadEnrichment, {
|
||||
leadId: enrichment.leadId,
|
||||
parentRunId: args.runId,
|
||||
});
|
||||
|
||||
await ctx.runMutation(internal.leadDiscovery.appendRunEvent, {
|
||||
runId: args.runId,
|
||||
level: "info",
|
||||
message: "Website-Kontaktanreicherung geplant.",
|
||||
details: [
|
||||
{
|
||||
label: "Unternehmen",
|
||||
value: enrichment.companyName,
|
||||
source: "google_places",
|
||||
},
|
||||
{
|
||||
label: "Website",
|
||||
value: enrichment.website,
|
||||
source: "google_places",
|
||||
},
|
||||
],
|
||||
});
|
||||
}
|
||||
|
||||
await ctx.runMutation(internal.leadDiscovery.finishCampaignRun, {
|
||||
runId: args.runId,
|
||||
status: "succeeded",
|
||||
@@ -275,7 +306,9 @@ export const startCampaignRun = internalMutation({
|
||||
|
||||
const activeRunning = await ctx.db
|
||||
.query("agentRuns")
|
||||
.withIndex("by_status", (q) => q.eq("status", "running"))
|
||||
.withIndex("by_type_and_status", (q) =>
|
||||
q.eq("type", "campaign").eq("status", "running"),
|
||||
)
|
||||
.take(1);
|
||||
|
||||
if (activeRunning.length > 0) {
|
||||
@@ -390,6 +423,11 @@ export const persistDiscoveredLeads = internalMutation({
|
||||
let skippedDuplicates = 0;
|
||||
let skippedBlacklisted = 0;
|
||||
let errors = 0;
|
||||
const websiteEnrichmentQueue: Array<{
|
||||
leadId: Id<"leads">;
|
||||
companyName: string;
|
||||
website: string;
|
||||
}> = [];
|
||||
|
||||
for (const candidate of args.candidates) {
|
||||
if (leadsCreated >= args.maxNewLeads) {
|
||||
@@ -556,8 +594,15 @@ export const persistDiscoveredLeads = internalMutation({
|
||||
lead.duplicateOfLeadId = probableDuplicateLead._id;
|
||||
}
|
||||
|
||||
await ctx.db.insert("leads", lead);
|
||||
const leadId = await ctx.db.insert("leads", lead);
|
||||
leadsCreated += 1;
|
||||
if (shouldScheduleWebsiteEnrichment(lead)) {
|
||||
websiteEnrichmentQueue.push({
|
||||
leadId,
|
||||
companyName: lead.companyName,
|
||||
website: lead.websiteDomain ?? lead.websiteUrl ?? "unbekannt",
|
||||
});
|
||||
}
|
||||
await ctx.db.insert("agentRunEvents", {
|
||||
runId: args.runId,
|
||||
level: "info",
|
||||
@@ -589,6 +634,7 @@ export const persistDiscoveredLeads = internalMutation({
|
||||
skippedDuplicates,
|
||||
skippedBlacklisted,
|
||||
errors,
|
||||
websiteEnrichmentQueue,
|
||||
};
|
||||
},
|
||||
});
|
||||
|
||||
@@ -1,26 +1,17 @@
|
||||
import { v } from "convex/values";
|
||||
|
||||
import { normalizeListLimit } from "./domain";
|
||||
import {
|
||||
RUN_EVENT_LEVELS,
|
||||
RUN_STATUSES,
|
||||
RUN_TYPES,
|
||||
normalizeListLimit,
|
||||
} from "./domain";
|
||||
import { mutation, query } from "./_generated/server";
|
||||
|
||||
const runType = v.union(
|
||||
v.literal("campaign"),
|
||||
v.literal("lead_discovery"),
|
||||
v.literal("audit"),
|
||||
v.literal("outreach"),
|
||||
v.literal("lifecycle"),
|
||||
);
|
||||
const runStatus = v.union(
|
||||
v.literal("pending"),
|
||||
v.literal("running"),
|
||||
v.literal("succeeded"),
|
||||
v.literal("failed"),
|
||||
v.literal("canceled"),
|
||||
);
|
||||
const runType = v.union(...RUN_TYPES.map((type) => v.literal(type)));
|
||||
const runStatus = v.union(...RUN_STATUSES.map((status) => v.literal(status)));
|
||||
const eventLevel = v.union(
|
||||
v.literal("info"),
|
||||
v.literal("warning"),
|
||||
v.literal("error"),
|
||||
...RUN_EVENT_LEVELS.map((level) => v.literal(level)),
|
||||
);
|
||||
|
||||
export const create = mutation({
|
||||
@@ -116,6 +107,16 @@ export const list = query({
|
||||
.take(limit);
|
||||
}
|
||||
|
||||
if (args.type) {
|
||||
const type = args.type;
|
||||
|
||||
return await ctx.db
|
||||
.query("agentRuns")
|
||||
.withIndex("by_type", (q) => q.eq("type", type))
|
||||
.order("desc")
|
||||
.take(limit);
|
||||
}
|
||||
|
||||
if (args.status) {
|
||||
const status = args.status;
|
||||
|
||||
|
||||
113
convex/schema.ts
113
convex/schema.ts
@@ -1,6 +1,11 @@
|
||||
import { defineSchema, defineTable } from "convex/server";
|
||||
import { v } from "convex/values";
|
||||
import { tables as authTables } from "./betterAuth/schema";
|
||||
import {
|
||||
RUN_EVENT_LEVELS,
|
||||
RUN_STATUSES,
|
||||
RUN_TYPES,
|
||||
} from "./domain";
|
||||
|
||||
const campaignStatus = v.union(v.literal("active"), v.literal("paused"));
|
||||
const leadPriority = v.union(
|
||||
@@ -75,24 +80,19 @@ const blacklistType = v.union(
|
||||
v.literal("company"),
|
||||
v.literal("google_place_id"),
|
||||
);
|
||||
const runType = v.union(
|
||||
v.literal("campaign"),
|
||||
v.literal("lead_discovery"),
|
||||
v.literal("audit"),
|
||||
v.literal("outreach"),
|
||||
v.literal("lifecycle"),
|
||||
);
|
||||
const runStatus = v.union(
|
||||
v.literal("pending"),
|
||||
v.literal("running"),
|
||||
v.literal("succeeded"),
|
||||
v.literal("failed"),
|
||||
v.literal("canceled"),
|
||||
const websiteEnrichmentPageKind = v.union(
|
||||
v.literal("homepage"),
|
||||
v.literal("contact"),
|
||||
v.literal("impressum"),
|
||||
v.literal("services"),
|
||||
v.literal("about"),
|
||||
v.literal("team"),
|
||||
v.literal("other"),
|
||||
);
|
||||
const runType = v.union(...RUN_TYPES.map((type) => v.literal(type)));
|
||||
const runStatus = v.union(...RUN_STATUSES.map((status) => v.literal(status)));
|
||||
const runEventLevel = v.union(
|
||||
v.literal("info"),
|
||||
v.literal("warning"),
|
||||
v.literal("error"),
|
||||
...RUN_EVENT_LEVELS.map((level) => v.literal(level)),
|
||||
);
|
||||
const screenshotViewport = v.union(v.literal("desktop"), v.literal("mobile"));
|
||||
const settingsValue = v.union(v.string(), v.number(), v.boolean(), v.null());
|
||||
@@ -255,6 +255,85 @@ export default defineSchema({
|
||||
.index("by_auditId_and_viewport", ["auditId", "viewport"])
|
||||
.index("by_storageId", ["storageId"]),
|
||||
|
||||
websiteCrawlPages: defineTable({
|
||||
leadId: v.id("leads"),
|
||||
runId: v.optional(v.id("agentRuns")),
|
||||
sourceUrl: v.string(),
|
||||
finalUrl: v.string(),
|
||||
pageKind: websiteEnrichmentPageKind,
|
||||
title: v.optional(v.string()),
|
||||
metaDescription: v.optional(v.string()),
|
||||
headings: v.array(v.string()),
|
||||
visibleTextExcerpt: v.optional(v.string()),
|
||||
hasContactFormSignal: v.boolean(),
|
||||
hasContactCtaSignal: v.boolean(),
|
||||
createdAt: v.number(),
|
||||
})
|
||||
.index("by_leadId", ["leadId"])
|
||||
.index("by_runId", ["runId"])
|
||||
.index("by_leadId_and_createdAt", ["leadId", "createdAt"]),
|
||||
|
||||
websiteCrawlLinks: defineTable({
|
||||
leadId: v.id("leads"),
|
||||
runId: v.optional(v.id("agentRuns")),
|
||||
pageUrl: v.string(),
|
||||
href: v.string(),
|
||||
text: v.optional(v.string()),
|
||||
isInternal: v.boolean(),
|
||||
isBroken: v.optional(v.boolean()),
|
||||
createdAt: v.number(),
|
||||
})
|
||||
.index("by_leadId", ["leadId"])
|
||||
.index("by_runId", ["runId"]),
|
||||
|
||||
websiteEmailCandidates: defineTable({
|
||||
leadId: v.id("leads"),
|
||||
runId: v.optional(v.id("agentRuns")),
|
||||
email: v.string(),
|
||||
normalizedEmail: v.string(),
|
||||
emailSource: v.string(),
|
||||
sourceUrl: v.string(),
|
||||
contactPerson: v.optional(v.string()),
|
||||
isBusinessContactAddress: v.boolean(),
|
||||
isGeneric: v.boolean(),
|
||||
accepted: v.boolean(),
|
||||
createdAt: v.number(),
|
||||
})
|
||||
.index("by_leadId", ["leadId"])
|
||||
.index("by_normalizedEmail", ["normalizedEmail"])
|
||||
.index("by_runId", ["runId"]),
|
||||
|
||||
websiteCrawlScreenshots: defineTable({
|
||||
leadId: v.id("leads"),
|
||||
runId: v.optional(v.id("agentRuns")),
|
||||
storageId: v.id("_storage"),
|
||||
viewport: screenshotViewport,
|
||||
sourceUrl: v.string(),
|
||||
capturedAt: v.number(),
|
||||
width: v.number(),
|
||||
height: v.number(),
|
||||
mimeType: v.string(),
|
||||
createdAt: v.number(),
|
||||
})
|
||||
.index("by_leadId", ["leadId"])
|
||||
.index("by_runId", ["runId"])
|
||||
.index("by_storageId", ["storageId"]),
|
||||
|
||||
websiteTechnicalChecks: defineTable({
|
||||
leadId: v.id("leads"),
|
||||
runId: v.optional(v.id("agentRuns")),
|
||||
sourceUrl: v.string(),
|
||||
finalUrl: v.optional(v.string()),
|
||||
usesHttps: v.boolean(),
|
||||
missingTitle: v.boolean(),
|
||||
missingMetaDescription: v.boolean(),
|
||||
hasVisibleContactPath: v.boolean(),
|
||||
brokenInternalLinkCount: v.number(),
|
||||
createdAt: v.number(),
|
||||
})
|
||||
.index("by_leadId", ["leadId"])
|
||||
.index("by_runId", ["runId"]),
|
||||
|
||||
outreachRecords: defineTable({
|
||||
leadId: v.id("leads"),
|
||||
auditId: v.optional(v.id("audits")),
|
||||
@@ -309,7 +388,9 @@ export default defineSchema({
|
||||
updatedAt: v.number(),
|
||||
})
|
||||
.index("by_status", ["status"])
|
||||
.index("by_type", ["type"])
|
||||
.index("by_type_and_status", ["type", "status"])
|
||||
.index("by_type_and_status_and_leadId", ["type", "status", "leadId"])
|
||||
.index("by_campaignId_and_updatedAt", ["campaignId", "updatedAt"])
|
||||
.index("by_campaignId_and_status", ["campaignId", "status"])
|
||||
.index("by_auditId", ["auditId"]),
|
||||
|
||||
408
convex/websiteEnrichment.ts
Normal file
408
convex/websiteEnrichment.ts
Normal file
@@ -0,0 +1,408 @@
|
||||
import { v } from "convex/values";
|
||||
import { internal } from "./_generated/api";
|
||||
import type { Doc } from "./_generated/dataModel";
|
||||
import { internalMutation } from "./_generated/server";
|
||||
import { normalizeEmailAddress } from "../lib/lead-discovery-google";
|
||||
|
||||
/**
 * Zeroed counter object written to every newly queued website-enrichment run.
 * The keys mirror the `counters` field inserted into `agentRuns` below.
 */
const RUN_COUNTER_TEMPLATE = {
  leadsFound: 0,
  leadsCreated: 0,
  auditsCreated: 0,
  outreachPrepared: 0,
  errors: 0,
};
|
||||
|
||||
/** The subset of a lead document handed back to the enrichment action. */
type WebsiteLead = Pick<Doc<"leads">, "_id" | "websiteUrl" | "contactStatus">;

/** Contact-status union as declared on the `leads` table. */
type LeadContactStatus = Doc<"leads">["contactStatus"];
|
||||
|
||||
/**
 * Queues a website-enrichment run for a lead and schedules the crawl action.
 *
 * Returns `null` when the lead does not exist or has no `websiteUrl`.
 * If a `pending` or `running` website-enrichment run already exists for the
 * lead, that run's id is returned and nothing new is created (pending is
 * checked first). Otherwise a new `pending` run plus an info event is inserted
 * and `processLeadEnrichment` is scheduled immediately.
 *
 * `parentRunId` is only recorded in the event details.
 */
export const queueLeadEnrichment = internalMutation({
  args: {
    leadId: v.id("leads"),
    parentRunId: v.optional(v.id("agentRuns")),
  },
  returns: v.union(v.id("agentRuns"), v.null()),
  handler: async (ctx, args) => {
    const now = Date.now();
    const lead = await ctx.db.get(args.leadId);

    if (!lead || !lead.websiteUrl) {
      return null;
    }

    // De-duplicate: reuse an in-flight run instead of queueing a second one.
    // The second lookup is skipped when the first one already found a run.
    for (const status of ["pending", "running"] as const) {
      const [activeRun] = await ctx.db
        .query("agentRuns")
        .withIndex("by_type_and_status_and_leadId", (q) =>
          q
            .eq("type", "website_enrichment")
            .eq("status", status)
            .eq("leadId", args.leadId),
        )
        .take(1);

      if (activeRun) {
        return activeRun._id;
      }
    }

    const runId = await ctx.db.insert("agentRuns", {
      type: "website_enrichment",
      leadId: args.leadId,
      status: "pending",
      counters: RUN_COUNTER_TEMPLATE,
      currentStep: "website_enrichment",
      createdAt: now,
      updatedAt: now,
    });

    await ctx.db.insert("agentRunEvents", {
      runId,
      level: "info",
      message: "Website-Enrichment wurde in die Warteschlange gesetzt.",
      details: [
        { label: "Lead", value: args.leadId },
        ...(args.parentRunId
          ? [{ label: "Parent-Run", value: args.parentRunId }]
          : []),
      ],
      createdAt: now,
    });

    await ctx.scheduler.runAfter(
      0,
      internal.websiteEnrichmentAction.processLeadEnrichment,
      {
        runId,
      },
    );

    return runId;
  },
});
|
||||
|
||||
/**
 * Moves a pending website-enrichment run to `running` and returns the lead
 * fields the crawl action needs.
 *
 * Returns `null` without touching anything when the run is missing, is not a
 * `website_enrichment` run, or is not `pending`. When the run has no lead id,
 * the lead is gone, or the lead has no `websiteUrl`, the run is marked
 * `failed`, an error event is written, and `null` is returned; in the
 * missing-URL case the lead additionally gets a `contactStatusReason`.
 */
export const startLeadEnrichmentRun = internalMutation({
  args: { runId: v.id("agentRuns") },
  handler: async (ctx, args): Promise<{ lead: WebsiteLead } | null> => {
    const now = Date.now();
    const run = await ctx.db.get(args.runId);

    if (!run || run.type !== "website_enrichment" || run.status !== "pending") {
      return null;
    }

    // Shared failure path: mark the run failed and record one error event.
    const failRun = async (
      errorSummary: string,
      message: string,
      leadIdValue: string,
    ): Promise<null> => {
      await ctx.db.patch(args.runId, {
        status: "failed",
        currentStep: "website_enrichment",
        errorSummary,
        updatedAt: now,
        finishedAt: now,
      });
      await ctx.db.insert("agentRunEvents", {
        runId: args.runId,
        level: "error",
        message,
        details: [{ label: "Lead-ID", value: leadIdValue }],
        createdAt: now,
      });
      return null;
    };

    if (!run.leadId) {
      // There is no id to report in this branch, so the placeholder is logged.
      return await failRun(
        "Der Lauf hat keine Lead-ID.",
        "Website-Enrichment konnte nicht gestartet werden: Keine Lead-ID.",
        "unbekannt",
      );
    }

    const lead = await ctx.db.get(run.leadId);

    if (!lead) {
      return await failRun(
        "Lead fehlt oder besitzt keine Website.",
        "Website-Enrichment konnte nicht gestartet werden: Kein Lead mit Website-URL.",
        run.leadId,
      );
    }

    if (!lead.websiteUrl) {
      await failRun(
        "Lead fehlt oder besitzt keine Website.",
        "Website-Enrichment konnte nicht gestartet werden: Kein Lead mit Website-URL.",
        lead._id,
      );
      await ctx.db.patch(lead._id, {
        contactStatusReason: "Website-URL fehlt für das Website-Enrichment.",
        updatedAt: now,
      });
      return null;
    }

    await ctx.db.patch(args.runId, {
      status: "running",
      currentStep: "website_enrichment",
      startedAt: now,
      updatedAt: now,
    });

    await ctx.db.insert("agentRunEvents", {
      runId: args.runId,
      level: "info",
      message: "Website-Enrichment gestartet.",
      details: [{ label: "Lead", value: lead._id }],
      createdAt: now,
    });

    return {
      lead: {
        _id: lead._id,
        websiteUrl: lead.websiteUrl,
        contactStatus: lead.contactStatus,
      },
    };
  },
});
|
||||
|
||||
/**
 * Stores everything a crawl produced for one lead/run pair.
 *
 * Each entry of `pages`, `links`, `emailCandidates`, `screenshots` and
 * `technicalChecks` becomes one row in the corresponding `website*` table,
 * stamped with the given `leadId`, `runId` and a single shared `createdAt`.
 * Returns nothing.
 */
export const persistLeadEnrichmentResult = internalMutation({
  args: {
    runId: v.id("agentRuns"),
    leadId: v.id("leads"),
    pages: v.array(
      v.object({
        sourceUrl: v.string(),
        finalUrl: v.string(),
        pageKind: v.union(
          v.literal("homepage"),
          v.literal("contact"),
          v.literal("impressum"),
          v.literal("services"),
          v.literal("about"),
          v.literal("team"),
          v.literal("other"),
        ),
        title: v.optional(v.string()),
        metaDescription: v.optional(v.string()),
        headings: v.array(v.string()),
        visibleTextExcerpt: v.optional(v.string()),
        hasContactFormSignal: v.boolean(),
        hasContactCtaSignal: v.boolean(),
      }),
    ),
    links: v.array(
      v.object({
        pageUrl: v.string(),
        href: v.string(),
        text: v.optional(v.string()),
        isInternal: v.boolean(),
        isBroken: v.optional(v.boolean()),
      }),
    ),
    emailCandidates: v.array(
      v.object({
        email: v.string(),
        normalizedEmail: v.string(),
        emailSource: v.string(),
        sourceUrl: v.string(),
        contactPerson: v.optional(v.string()),
        isBusinessContactAddress: v.boolean(),
        isGeneric: v.boolean(),
        accepted: v.boolean(),
      }),
    ),
    screenshots: v.array(
      v.object({
        storageId: v.id("_storage"),
        viewport: v.union(v.literal("desktop"), v.literal("mobile")),
        sourceUrl: v.string(),
        capturedAt: v.number(),
        width: v.number(),
        height: v.number(),
        mimeType: v.string(),
      }),
    ),
    technicalChecks: v.array(
      v.object({
        sourceUrl: v.string(),
        finalUrl: v.optional(v.string()),
        usesHttps: v.boolean(),
        missingTitle: v.boolean(),
        missingMetaDescription: v.boolean(),
        hasVisibleContactPath: v.boolean(),
        brokenInternalLinkCount: v.number(),
      }),
    ),
  },
  handler: async (ctx, args) => {
    // One timestamp for the whole batch so rows of the same crawl line up.
    const createdAt = Date.now();
    const ownership = { leadId: args.leadId, runId: args.runId, createdAt };

    for (const page of args.pages) {
      await ctx.db.insert("websiteCrawlPages", { ...page, ...ownership });
    }

    for (const link of args.links) {
      await ctx.db.insert("websiteCrawlLinks", { ...link, ...ownership });
    }

    for (const candidate of args.emailCandidates) {
      await ctx.db.insert("websiteEmailCandidates", {
        ...candidate,
        ...ownership,
      });
    }

    for (const screenshot of args.screenshots) {
      await ctx.db.insert("websiteCrawlScreenshots", {
        ...screenshot,
        ...ownership,
      });
    }

    for (const checks of args.technicalChecks) {
      await ctx.db.insert("websiteTechnicalChecks", {
        ...checks,
        ...ownership,
      });
    }
  },
});
|
||||
|
||||
/**
 * Writes the terminal state of a website-enrichment run.
 *
 * Sets `status`, `finishedAt`/`updatedAt`, `currentStep` (defaults to
 * "website_enrichment"), `errorSummary` and the counters. `leadsFound` is
 * always 1 and `errors` defaults to 0; the remaining counters stay at zero.
 */
export const finishLeadEnrichmentRun = internalMutation({
  args: {
    runId: v.id("agentRuns"),
    status: v.union(
      v.literal("succeeded"),
      v.literal("failed"),
      v.literal("canceled"),
    ),
    currentStep: v.optional(v.string()),
    errorSummary: v.optional(v.string()),
    errors: v.optional(v.number()),
  },
  handler: async (ctx, args) => {
    const now = Date.now();

    await ctx.db.patch(args.runId, {
      status: args.status,
      updatedAt: now,
      finishedAt: now,
      currentStep: args.currentStep ?? "website_enrichment",
      errorSummary: args.errorSummary,
      // Start from the shared zeroed template so the counter keys cannot drift.
      counters: {
        ...RUN_COUNTER_TEMPLATE,
        leadsFound: 1,
        errors: args.errors ?? 0,
      },
    });
  },
});
|
||||
|
||||
/**
 * Applies contact data found by the website crawl to a lead.
 *
 * - `email` is written (normalised, together with `normalizedEmail` and
 *   `emailSource`) only when both `email` and `emailSource` are given and the
 *   address normalises successfully.
 * - `contactPerson` is written when provided.
 * - When `contactStatusReason` is provided it is stored and the status is left
 *   alone. Otherwise the lead is promoted from "missing_contact" to "new", but
 *   only if an email was actually written by this call and the stored lead is
 *   still in "missing_contact" (the caller's snapshot may be stale).
 *
 * Returns `null` when the lead no longer exists, otherwise the lead id.
 */
export const patchLeadFromWebsiteEnrichment = internalMutation({
  args: {
    leadId: v.id("leads"),
    email: v.optional(v.string()),
    emailSource: v.optional(v.string()),
    contactPerson: v.optional(v.string()),
    currentContactStatus: v.union(
      v.literal("new"),
      v.literal("missing_contact"),
      v.literal("audit_ready"),
      v.literal("outreach_ready"),
      v.literal("contacted"),
      v.literal("replied"),
      v.literal("do_not_contact"),
    ),
    contactStatusReason: v.optional(v.string()),
  },
  handler: async (ctx, args) => {
    const lead = await ctx.db.get(args.leadId);
    if (!lead) {
      return null;
    }

    type LeadPatch = {
      email?: string;
      normalizedEmail?: string;
      emailSource?: string;
      contactPerson?: string;
      contactStatus?: LeadContactStatus;
      contactStatusReason?: string;
      updatedAt: number;
    };

    const patch: LeadPatch = {
      updatedAt: Date.now(),
    };

    if (args.email && args.emailSource) {
      const normalized = normalizeEmailAddress(args.email);
      if (normalized) {
        patch.email = normalized;
        patch.normalizedEmail = normalized;
        patch.emailSource = args.emailSource;
      }
    }

    if (args.contactPerson) {
      patch.contactPerson = args.contactPerson;
    }

    if (args.contactStatusReason !== undefined) {
      patch.contactStatusReason = args.contactStatusReason;
    } else if (
      // Promote only when this call really stored an email ...
      patch.email !== undefined &&
      args.currentContactStatus === "missing_contact" &&
      // ... and nobody changed the status since the caller read it.
      lead.contactStatus === "missing_contact"
    ) {
      patch.contactStatus = "new";
    }

    // `updatedAt` alone is not worth a write.
    if (Object.keys(patch).length > 1) {
      await ctx.db.patch(args.leadId, patch);
    }

    return args.leadId;
  },
});
|
||||
725
convex/websiteEnrichmentAction.ts
Normal file
725
convex/websiteEnrichmentAction.ts
Normal file
@@ -0,0 +1,725 @@
|
||||
"use node";
|
||||
|
||||
import type { Browser, BrowserContext } from "playwright-core";
|
||||
import { createHash } from "node:crypto";
|
||||
import { access, readFile, rm, writeFile } from "node:fs/promises";
|
||||
import { tmpdir } from "node:os";
|
||||
import path from "node:path";
|
||||
import { v } from "convex/values";
|
||||
import {
|
||||
buildTechnicalChecks,
|
||||
discoverRelevantSubpageUrls,
|
||||
extractContactSignalsFromHtmlLikeText,
|
||||
isSameRegistrableHostishDomain,
|
||||
normalizeCrawlUrl,
|
||||
} from "../lib/website-crawler";
|
||||
import {
|
||||
getUsableContactEmailFromEntries,
|
||||
normalizeEmailAddress,
|
||||
} from "../lib/lead-discovery-google";
|
||||
import { api, internal } from "./_generated/api";
|
||||
import type { Doc, Id } from "./_generated/dataModel";
|
||||
import { internalAction, type ActionCtx } from "./_generated/server";
|
||||
|
||||
/** Navigation timeout used when `TASK8_CRAWL_TIMEOUT_MS` is unset or invalid. */
const DEFAULT_CRAWL_TIMEOUT_MS = 60_000;
/** Default and upper bound for the number of pages crawled per lead. */
const DEFAULT_CRAWL_MAX_PAGES = 5;
const MAX_PERSISTED_LINKS = 120;
const MAX_PERSISTED_EMAIL_CANDIDATES = 40;
const SCREENSHOT_MIME_TYPE = "image/png";
/** File holding the SHA-256 of the Chromium source the temp-dir cache was built from. */
const CHROMIUM_SOURCE_MARKER_FILE = path.join(tmpdir(), "chromium-source.sha256");
const CHROMIUM_EXECUTABLE_PATH = path.join(tmpdir(), "chromium");
const CHROMIUM_PACK_PATH = path.join(tmpdir(), "chromium-pack");
/** Mailbox local parts treated as generic (role) addresses. */
const GENERIC_EMAIL_LOCALS = new Set([
  "info",
  "kontakt",
  "contact",
  "sales",
  "team",
  "support",
  "service",
  "hello",
  "marketing",
  "admin",
  "office",
  "impressum",
  "post",
]);
/** Environment variables checked, in order, for the Chromium source. */
const CHROMIUM_EXECUTABLE_SOURCE_ENV_VARS = [
  "TASK8_BROWSER_ASSET_URL",
  "TASK8_CHROMIUM_EXECUTABLE_URL",
  "TASK8_CHROMIUM_EXECUTABLE",
];
|
||||
|
||||
/** Page classification; the literals match the `pageKind` validator used when persisting pages. */
type EnrichmentPageKind =
  | "homepage"
  | "contact"
  | "impressum"
  | "services"
  | "about"
  | "team"
  | "other";

/** A link extracted from a crawled page, with its href already normalised. */
type CrawlPageLink = {
  href: string;
  text: string;
  isInternal: boolean;
};

/** A crawled link together with the page it was found on. */
type PersistedCrawlLink = CrawlPageLink & {
  pageUrl: string;
};

/** Everything extracted from a single crawled page. */
type PageResult = {
  sourceUrl: string;
  finalUrl: string;
  pageKind: EnrichmentPageKind;
  title: string;
  metaDescription: string;
  headings: string[];
  visibleText: string;
  links: CrawlPageLink[];
  emailCandidates: Array<{
    email: string;
    emailSource: string;
    contactPerson: string | null;
    isBusinessContactAddress: boolean;
    isGeneric: boolean;
    sourceUrl: string;
    accepted: boolean;
    normalizedEmail: string;
  }>;
  hasContactFormSignal: boolean;
  hasContactCtaSignal: boolean;
};

/** Metadata of a screenshot that has already been written to storage. */
type StoredScreenshot = {
  storageId: Id<"_storage">;
  viewport: "desktop" | "mobile";
  sourceUrl: string;
  capturedAt: number;
  width: number;
  height: number;
  mimeType: string;
};

/** Lead fields required by the crawl. */
type WebsiteLead = Pick<Doc<"leads">, "_id" | "websiteUrl" | "contactStatus">;

/** Wrapper around the lead handed to the crawl once a run has started. */
type StartedLead = {
  lead: WebsiteLead;
};

/** The parts of the serverless Chromium package this module relies on. */
type ServerlessChromiumModule = {
  args: string[];
  executablePath: (input?: string) => Promise<string>;
  inflate: (filePath: string) => Promise<string>;
  setupLambdaEnvironment: (baseLibPath: string) => void;
};
|
||||
|
||||
/**
 * Turns any thrown value into a printable message: the `message` of an
 * `Error`, otherwise the value's string conversion.
 */
function messageFromError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}
|
||||
|
||||
/**
 * Reads a strictly positive integer from the environment.
 *
 * @param key Name of the environment variable.
 * @param fallback Value returned when the variable is unset, blank, not a
 *   plain decimal integer, or not greater than zero.
 */
function readPositiveIntEnv(key: string, fallback: number): number {
  const raw = process.env[key]?.trim();
  // Accept digits only: `parseInt` alone would silently read "1e3" as 1 and
  // "12abc" as 12, turning a typo into a surprising setting.
  if (!raw || !/^\+?\d+$/.test(raw)) {
    return fallback;
  }
  const parsed = Number.parseInt(raw, 10);
  return Number.isSafeInteger(parsed) && parsed > 0 ? parsed : fallback;
}
|
||||
|
||||
/** Timeout in milliseconds, overridable via `TASK8_CRAWL_TIMEOUT_MS`. */
function crawlTimeoutMs(): number {
  return readPositiveIntEnv("TASK8_CRAWL_TIMEOUT_MS", DEFAULT_CRAWL_TIMEOUT_MS);
}
|
||||
|
||||
/**
 * Maximum number of pages to crawl per lead.
 *
 * `TASK8_CRAWL_MAX_PAGES` can only lower the limit: the result is clamped to
 * the range 1..DEFAULT_CRAWL_MAX_PAGES, so the default doubles as a hard cap.
 */
function crawlMaxPages(): number {
  const requested = readPositiveIntEnv(
    "TASK8_CRAWL_MAX_PAGES",
    DEFAULT_CRAWL_MAX_PAGES,
  );
  return Math.max(1, Math.min(DEFAULT_CRAWL_MAX_PAGES, requested));
}
|
||||
|
||||
/**
 * Classifies a crawled URL by its path, relative to the site's root URL.
 *
 * Paths are compared percent-decoded, lower-cased and without trailing
 * slashes. `URL.pathname` is percent-encoded, so without decoding a path like
 * "/über-uns" could never match, and without the trailing-slash handling a
 * non-root homepage such as "/de/" was not recognised as the homepage.
 *
 * @param url Absolute URL of the page being classified.
 * @param rootUrl The lead's website URL; its path defines the homepage.
 * @returns "other" when either URL cannot be parsed or nothing matches.
 */
function makePageKind(url: string, rootUrl: string): EnrichmentPageKind {
  const normalizedRoot = normalizeCrawlUrl(rootUrl);
  if (!normalizedRoot) {
    return "other";
  }

  // Reduce a URL to a canonical path for comparison; null when unparsable.
  const toComparablePath = (rawUrl: string): string | null => {
    let pathname: string;
    try {
      pathname = new URL(rawUrl).pathname;
    } catch {
      return null;
    }
    try {
      pathname = decodeURIComponent(pathname);
    } catch {
      // Malformed percent-escape: keep the raw pathname.
    }
    return pathname.toLowerCase().replace(/\/+$/, "") || "/";
  };

  const pagePath = toComparablePath(url);
  if (pagePath === null) {
    return "other";
  }

  if (pagePath === toComparablePath(normalizedRoot)) {
    return "homepage";
  }

  if (/(?:^|\/)(kontakt|contact)(?:[-/]|$)/.test(pagePath)) {
    return "contact";
  }
  if (/(?:^|\/)(impressum|imprint)(?:[-/]|$)/.test(pagePath)) {
    return "impressum";
  }
  if (/(?:^|\/)(leistungen|angebot|services?)(?:[-/]|$)/.test(pagePath)) {
    return "services";
  }
  if (/(?:^|\/)(ueber|über|about)(?:[-/]|$)/.test(pagePath)) {
    return "about";
  }
  // "team" is a page kind of its own; folding it into "about" left that kind
  // unreachable from this classifier.
  if (/(?:^|\/)team(?:[-/]|$)/.test(pagePath)) {
    return "team";
  }

  return "other";
}
|
||||
|
||||
/**
 * Collapses all whitespace runs to single spaces, trims, and limits the
 * result to 1200 UTF-16 code units.
 *
 * If the cut would fall inside a surrogate pair, the dangling high surrogate
 * is dropped so the excerpt stays well-formed.
 */
function trimExcerpt(value: string): string {
  const excerpt = value.replace(/\s+/g, " ").trim().slice(0, 1200);
  const lastUnit = excerpt.charCodeAt(excerpt.length - 1);
  const endsWithHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
  return endsWithHighSurrogate ? excerpt.slice(0, -1) : excerpt;
}
|
||||
|
||||
/**
 * Reports whether the address's local part (lower-cased, with any "+suffix"
 * removed) is one of the generic mailbox names in `GENERIC_EMAIL_LOCALS`.
 */
function isGenericBusinessEmail(email: string): boolean {
  const local = email.split("@")[0]?.toLowerCase() ?? "";
  const base = local.split("+")[0] ?? "";
  return GENERIC_EMAIL_LOCALS.has(base);
}
|
||||
|
||||
/**
 * Lazily imports `playwright-core` and `@sparticuz/chromium-min` and returns
 * them, with the Chromium package narrowed to the members this module uses.
 */
async function loadPlaywrightModules() {
  const [playwrightCore, chromiumPackage] = await Promise.all([
    import("playwright-core"),
    import("@sparticuz/chromium-min"),
  ]);
  const chromium = chromiumPackage.default;

  return {
    playwrightCore,
    serverlessChromium: {
      args: chromium.args,
      // Call through the class rather than handing out the detached method, so
      // any `this` used inside `executablePath` still refers to the class.
      executablePath: (input?: string) => chromium.executablePath(input),
      inflate: chromiumPackage.inflate,
      setupLambdaEnvironment: chromiumPackage.setupLambdaEnvironment,
    } as ServerlessChromiumModule,
  };
}
|
||||
|
||||
/**
 * Returns the first non-blank value among the environment variables listed in
 * `CHROMIUM_EXECUTABLE_SOURCE_ENV_VARS` (trimmed), or `null` when none is set.
 */
function getChromiumExecutableSource(): string | null {
  for (const key of CHROMIUM_EXECUTABLE_SOURCE_ENV_VARS) {
    const value = process.env[key]?.trim();
    if (value) {
      return value;
    }
  }

  return null;
}
|
||||
|
||||
/** Hex-encoded SHA-256 of the Chromium source string, used as a cache marker. */
function getChromiumSourceMarker(source: string): string {
  return createHash("sha256").update(source).digest("hex");
}
|
||||
|
||||
/**
 * Removes the cached Chromium executable and pack directory from the temp dir
 * unless the stored marker matches the hash of `executableSource`.
 *
 * A missing or unreadable marker file counts as a mismatch.
 */
async function clearChromiumCacheForSourceMismatch(
  executableSource: string,
): Promise<void> {
  const expectedMarker = getChromiumSourceMarker(executableSource);
  const storedMarker = await readFile(CHROMIUM_SOURCE_MARKER_FILE, "utf8").catch(
    () => null,
  );
  if ((storedMarker ?? "").trim() === expectedMarker) {
    return;
  }

  await Promise.all([
    rm(CHROMIUM_EXECUTABLE_PATH, { force: true, recursive: true }),
    rm(CHROMIUM_PACK_PATH, { force: true, recursive: true }),
  ]);
}
|
||||
|
||||
/**
 * Resolves the path of a usable Chromium executable for the configured source.
 *
 * Clears the temp-dir cache first if it was built from a different source,
 * asks the Chromium package for the executable path, then records the source
 * hash in the marker file for the next invocation.
 *
 * @throws Error when no Chromium source environment variable is configured.
 */
async function resolveChromiumExecutablePath(
  chromium: ServerlessChromiumModule,
): Promise<string> {
  const executableSource = getChromiumExecutableSource();
  if (!executableSource) {
    throw new Error(
      `Set TASK8_BROWSER_ASSET_URL (or legacy TASK8_CHROMIUM_EXECUTABLE_URL / TASK8_CHROMIUM_EXECUTABLE) to configure the Chromium source; no source is configured.`,
    );
  }

  await clearChromiumCacheForSourceMismatch(executableSource);
  const executablePath = await chromium.executablePath(executableSource);
  // Written only after the executable resolved, so a failed attempt is not
  // recorded as a valid cache.
  await writeFile(
    CHROMIUM_SOURCE_MARKER_FILE,
    getChromiumSourceMarker(executableSource),
  );

  return executablePath;
}
|
||||
|
||||
/**
 * Opens the homepage in a new page of the given browser context, takes a
 * full-page PNG screenshot and stores it in Convex file storage.
 *
 * `viewport` is only recorded as a label on the result; the actual viewport
 * comes from `context`. `width`/`height` report the page's viewport size
 * (0x0 when none is set), not the dimensions of the full-page image.
 *
 * @returns Metadata of the stored screenshot, with `sourceUrl` being the URL
 *   the page ended up on after navigation.
 */
async function captureHomepageScreenshot(
  ctx: ActionCtx,
  context: BrowserContext,
  homepageUrl: string,
  viewport: "desktop" | "mobile",
  timeoutMs: number,
) {
  const page = await context.newPage();
  try {
    await page.goto(homepageUrl, {
      waitUntil: "domcontentloaded",
      timeout: timeoutMs,
    });
    const sourceUrl = page.url();
    const screenshot = await page.screenshot({
      fullPage: true,
      type: "png",
    });
    const storageId = await ctx.storage.store(
      new Blob([new Uint8Array(screenshot)], { type: SCREENSHOT_MIME_TYPE }),
    );
    const viewportSize = page.viewportSize() ?? { width: 0, height: 0 };

    return {
      storageId,
      viewport,
      sourceUrl,
      capturedAt: Date.now(),
      width: viewportSize.width,
      height: viewportSize.height,
      mimeType: SCREENSHOT_MIME_TYPE,
    } satisfies StoredScreenshot;
  } finally {
    // Best-effort cleanup: a failing close must not replace the navigation or
    // screenshot error, nor discard an already stored screenshot.
    await page.close().catch(() => undefined);
  }
}
|
||||
|
||||
/**
 * Loads a single URL in a new page and extracts the data used for enrichment:
 * title, meta description, headings, visible text, links and e-mail
 * candidates, plus the contact form / CTA signals reported by
 * `extractContactSignalsFromHtmlLikeText`.
 *
 * Title, meta description, headings and link extraction fall back to empty
 * values when they fail; `page.content()` and the visible-text extraction do
 * not, so a failure there rejects the returned promise. The page is closed in
 * every case.
 *
 * @param context - Browser context in which the page is opened.
 * @param targetUrl - URL to navigate to; also used to derive `pageKind`.
 * @param rootUrl - Site root used to classify links as internal and to derive
 *   `pageKind`.
 * @param timeoutMs - Navigation timeout passed to `page.goto`.
 * @returns The extracted page data, or `null` when `page.goto` yields no
 *   response.
 */
async function crawlPage(
  context: BrowserContext,
  targetUrl: string,
  rootUrl: string,
  timeoutMs: number,
) {
  const tab = await context.newPage();

  try {
    const navigation = await tab.goto(targetUrl, {
      timeout: timeoutMs,
      waitUntil: "domcontentloaded",
    });
    if (!navigation) {
      return null;
    }

    // URL after navigation; all extracted data is attributed to it.
    const finalUrl = tab.url();

    const title = await tab.title().catch(() => "");

    const metaDescription = await tab
      .evaluate(
        () =>
          document.querySelector<HTMLMetaElement>("meta[name='description']")
            ?.content ?? "",
      )
      .catch(() => "");

    const html = await tab.content();
    const signals = extractContactSignalsFromHtmlLikeText(html);

    const headings = await tab
      .evaluate(() =>
        Array.from(
          document.querySelectorAll("h1, h2, h3"),
          (node) => node.textContent?.trim() ?? "",
        ).filter((text) => text.length > 0),
      )
      .catch(() => []);

    const visibleText = await tab.evaluate(
      () => document.body?.innerText ?? "",
    );

    const rawLinks = await tab
      .evaluate(() =>
        Array.from(document.querySelectorAll("a[href]"), (anchor) => ({
          href: anchor.getAttribute("href") ?? "",
          text: anchor.textContent?.trim() ?? "",
        })),
      )
      .catch(() => []);

    // Links whose href cannot be normalized are dropped.
    const links = rawLinks.flatMap((raw) => {
      const href = normalizeCrawlUrl(raw.href, finalUrl);
      if (!href) {
        return [];
      }
      return [
        {
          href,
          text: raw.text,
          isInternal: isSameRegistrableHostishDomain(href, rootUrl),
        },
      ];
    });

    // E-mail candidates whose address cannot be normalized are dropped.
    const emailCandidates = signals.emailCandidates.flatMap((found) => {
      const normalizedEmail = normalizeEmailAddress(found.email);
      if (!normalizedEmail) {
        return [];
      }
      return [
        {
          email: normalizedEmail,
          emailSource: finalUrl,
          contactPerson: found.contactPerson ?? null,
          isBusinessContactAddress: found.isBusinessContactAddress,
          isGeneric: isGenericBusinessEmail(normalizedEmail),
          sourceUrl: finalUrl,
          accepted: false,
          normalizedEmail,
        },
      ];
    });

    return {
      sourceUrl: finalUrl,
      finalUrl,
      pageKind: makePageKind(targetUrl, rootUrl),
      title,
      metaDescription,
      headings,
      visibleText,
      links,
      emailCandidates,
      hasContactFormSignal: signals.hasContactFormSignal,
      hasContactCtaSignal: signals.hasContactCtaSignal,
    } satisfies PageResult;
  } finally {
    await tab.close();
  }
}
|
||||
|
||||
/**
 * Removes e-mail candidates that share a `normalizedEmail` with an earlier
 * candidate.
 *
 * The first occurrence of each address wins and the relative order of the
 * surviving candidates is preserved.
 *
 * @param candidates - Candidates collected across crawled pages.
 * @returns A new array containing one candidate per normalized address.
 */
function deduplicateLeadEmailCandidates(
  candidates: PageResult["emailCandidates"],
) {
  const seenEmails = new Set<string>();

  return candidates.filter((entry) => {
    if (seenEmails.has(entry.normalizedEmail)) {
      return false;
    }
    seenEmails.add(entry.normalizedEmail);
    return true;
  });
}
|
||||
|
||||
/**
 * Removes links whose `href` was already seen earlier in the list.
 *
 * The first occurrence of each `href` wins and the relative order of the
 * surviving links is preserved.
 *
 * @param links - Links collected across crawled pages.
 * @returns A new array containing one link per distinct `href`.
 */
function deduplicateCrawlLinks(links: PersistedCrawlLink[]) {
  const seenHrefs = new Set<string>();

  return links.filter((entry) => {
    if (seenHrefs.has(entry.href)) {
      return false;
    }
    seenHrefs.add(entry.href);
    return true;
  });
}
|
||||
|
||||
/**
 * Internal action that runs the website enrichment crawl for one agent run.
 *
 * Steps, in order:
 * 1. Start the run via `startLeadEnrichmentRun`; stop with `null` when there
 *    is nothing to start.
 * 2. Normalize the lead's website URL; on failure mark the run as failed,
 *    append an error event, patch the lead and return `null`.
 * 3. Resolve and launch Chromium with a desktop and a mobile context.
 * 4. Crawl the homepage (required) and the discovered subpages (best-effort).
 * 5. Check up to 30 unique internal links via HTTP GET.
 * 6. Capture desktop and mobile homepage screenshots.
 * 7. Persist pages, links, e-mail candidates, screenshots and technical
 *    checks, patch the lead, finish the run and append an info event.
 *
 * Any error thrown in the main flow marks the run as failed, appends an error
 * event and, when the lead is known, records the failure reason on the lead.
 * Browser resources are released in every case.
 *
 * @returns The run id on success, otherwise `null`.
 */
export const processLeadEnrichment = internalAction({
  args: { runId: v.id("agentRuns") },
  handler: async (ctx, args) => {
    let started: StartedLead | null = null;
    const runId = args.runId;
    let browser: Browser | null = null;
    let desktopContext: BrowserContext | null = null;
    let mobileContext: BrowserContext | null = null;

    try {
      started = await ctx.runMutation(
        internal.websiteEnrichment.startLeadEnrichmentRun,
        {
          runId,
        },
      );

      if (!started) {
        return null;
      }

      const rootUrl = normalizeCrawlUrl(started.lead.websiteUrl);
      if (!rootUrl) {
        await ctx.runMutation(
          internal.websiteEnrichment.finishLeadEnrichmentRun,
          {
            runId,
            status: "failed",
            currentStep: "website_enrichment",
            errorSummary: "Ungültige Website-URL.",
            errors: 1,
          },
        );
        await ctx.runMutation(api.runs.appendEvent, {
          runId,
          level: "error",
          message: "Website-Enrichment fehlgeschlagen: Ungültige Website-URL.",
          details: [{ label: "Lead", value: started.lead._id }],
        });
        await ctx.runMutation(
          internal.websiteEnrichment.patchLeadFromWebsiteEnrichment,
          {
            leadId: started.lead._id,
            currentContactStatus: started.lead.contactStatus,
            contactStatusReason:
              "Website-Enrichment fehlgeschlagen: Ungültige Website-URL.",
          },
        );
        return null;
      }

      const timeoutMs = crawlTimeoutMs();
      const maxPages = crawlMaxPages();

      const { playwrightCore, serverlessChromium } =
        await loadPlaywrightModules();
      const executablePath = await resolveChromiumExecutablePath(
        serverlessChromium,
      );

      // Inflates the AL2023 shared library archive from the Chromium pack and
      // points the runtime environment at the extracted library directory.
      const prepareChromiumSharedLibraries = async (
        chromiumRuntime: ServerlessChromiumModule,
      ) => {
        const runtimeArchivePath = path.join(
          CHROMIUM_PACK_PATH,
          "al2023.tar.br",
        );
        await access(runtimeArchivePath).catch(() => {
          throw new Error(
            `AL2023 shared library archive not found at ${runtimeArchivePath}; cannot prepare Chromium shared libraries.`,
          );
        });

        await chromiumRuntime.inflate(runtimeArchivePath);
        chromiumRuntime.setupLambdaEnvironment(
          path.join(tmpdir(), "al2023", "lib"),
        );
      };

      await prepareChromiumSharedLibraries(serverlessChromium);
      browser = await playwrightCore.chromium.launch({
        headless: true,
        executablePath,
        args: serverlessChromium.args,
      });
      const { devices } = playwrightCore;
      desktopContext = await browser.newContext({
        ...devices["Desktop Chrome"],
      });
      mobileContext = await browser.newContext({
        ...devices["iPhone 11"],
      });

      // The homepage is mandatory: without it there is nothing to enrich.
      const homepage = await crawlPage(
        desktopContext,
        rootUrl,
        rootUrl,
        timeoutMs,
      );
      if (!homepage) {
        throw new Error("Homepage konnte nicht geladen werden.");
      }

      const requestedPages = discoverRelevantSubpageUrls(
        homepage.links.map((link) => link.href),
        rootUrl,
      );
      const crawlTargets = requestedPages.slice(0, maxPages);
      const crawledPages: PageResult[] = [homepage];

      // The first target is skipped; the homepage has already been crawled.
      for (const pageUrl of crawlTargets.slice(1)) {
        try {
          const crawled = await crawlPage(
            desktopContext,
            pageUrl,
            rootUrl,
            timeoutMs,
          );
          if (crawled) {
            crawledPages.push(crawled);
          }
        } catch (subpageError) {
          // Subpages are best-effort: a timeout or navigation error on one of
          // them must not discard the data already collected from the
          // homepage and the other subpages.
          console.warn(
            `Website enrichment: skipping subpage ${pageUrl}: ${messageFromError(subpageError)}`,
          );
        }
      }

      const allLinks: PersistedCrawlLink[] = crawledPages.flatMap((page) =>
        page.links.map((link) => ({
          ...link,
          pageUrl: page.finalUrl,
        })),
      );
      const internalLinks = allLinks.filter((link) => link.isInternal);
      const uniqueInternalLinks = [
        ...new Set(internalLinks.map((link) => link.href)),
      ];

      const checkMap = new Map<
        string,
        { status: number | null; isBroken: boolean }
      >();

      // Upper bound on link checks so the sequential requests stay bounded.
      const maxLinkChecks = 30;
      for (const href of uniqueInternalLinks.slice(0, maxLinkChecks)) {
        try {
          const response = await desktopContext.request.get(href, {
            timeout: Math.max(1_000, timeoutMs - 1_000),
          });
          const status = response.status();
          checkMap.set(href, {
            status,
            isBroken: status < 200 || status >= 400,
          });
        } catch {
          // A request that fails outright is recorded as broken without a
          // status code.
          checkMap.set(href, {
            status: null,
            isBroken: true,
          });
        }
      }

      const desktopScreenshot = await captureHomepageScreenshot(
        ctx,
        desktopContext,
        homepage.finalUrl,
        "desktop",
        timeoutMs,
      );
      const mobileScreenshot = await captureHomepageScreenshot(
        ctx,
        mobileContext,
        homepage.finalUrl,
        "mobile",
        timeoutMs,
      );

      const technicalInput = buildTechnicalChecks({
        rootUrl,
        finalUrl: homepage.finalUrl,
        title: homepage.title,
        metaDescription: homepage.metaDescription,
        visibleText: homepage.visibleText,
        checkedUrls: crawledPages.map((page) => page.finalUrl),
        links: allLinks.map((link) => {
          const check = checkMap.get(link.href);
          return {
            href: link.href,
            status: check?.status ?? undefined,
            statusCode: check?.status ?? undefined,
            isBroken: check?.isBroken,
          };
        }),
      });

      const validCandidates = deduplicateLeadEmailCandidates(
        crawledPages.flatMap((page) => page.emailCandidates),
      );
      const persistedLinks = deduplicateCrawlLinks(allLinks).slice(
        0,
        MAX_PERSISTED_LINKS,
      );
      const persistedCandidates = validCandidates.slice(
        0,
        MAX_PERSISTED_EMAIL_CANDIDATES,
      );
      const usable = getUsableContactEmailFromEntries(
        validCandidates.map((candidate) => ({
          email: candidate.email,
          emailSource: candidate.emailSource,
          contactPerson: candidate.contactPerson,
          isBusinessContactAddress: candidate.isBusinessContactAddress,
        })),
      );

      await ctx.runMutation(
        internal.websiteEnrichment.persistLeadEnrichmentResult,
        {
          runId,
          leadId: started.lead._id,
          pages: crawledPages.map((page) => ({
            sourceUrl: page.sourceUrl,
            finalUrl: page.finalUrl,
            pageKind: page.pageKind,
            title: page.title,
            metaDescription: page.metaDescription,
            headings: page.headings,
            visibleTextExcerpt: trimExcerpt(page.visibleText),
            hasContactFormSignal: page.hasContactFormSignal,
            hasContactCtaSignal: page.hasContactCtaSignal,
          })),
          links: persistedLinks.map((link) => ({
            pageUrl: link.pageUrl,
            href: link.href,
            text: link.text,
            isInternal: link.isInternal,
            isBroken: checkMap.get(link.href)?.isBroken,
          })),
          emailCandidates: persistedCandidates.map((candidate) => ({
            email: candidate.email,
            normalizedEmail: candidate.normalizedEmail,
            emailSource: candidate.emailSource,
            sourceUrl: candidate.sourceUrl,
            contactPerson: candidate.contactPerson ?? undefined,
            isBusinessContactAddress: candidate.isBusinessContactAddress,
            isGeneric: candidate.isGeneric,
            // Only the candidate that was selected as usable is accepted.
            accepted:
              usable !== null && candidate.normalizedEmail === usable.email,
          })),
          screenshots: [
            ...(desktopScreenshot ? [desktopScreenshot] : []),
            ...(mobileScreenshot ? [mobileScreenshot] : []),
          ],
          technicalChecks: [
            {
              sourceUrl: homepage.sourceUrl,
              finalUrl: homepage.finalUrl,
              usesHttps: technicalInput.https,
              missingTitle: technicalInput.missingTitle,
              missingMetaDescription: technicalInput.missingMetaDescription,
              hasVisibleContactPath: technicalInput.hasVisibleContactPath,
              brokenInternalLinkCount:
                technicalInput.brokenInternalLinks.length,
            },
          ],
        },
      );

      if (usable) {
        await ctx.runMutation(
          internal.websiteEnrichment.patchLeadFromWebsiteEnrichment,
          {
            leadId: started.lead._id,
            email: usable.email,
            emailSource: usable.emailSource ?? undefined,
            contactPerson: usable.contactPerson ?? undefined,
            currentContactStatus: started.lead.contactStatus,
          },
        );
      } else {
        await ctx.runMutation(
          internal.websiteEnrichment.patchLeadFromWebsiteEnrichment,
          {
            leadId: started.lead._id,
            currentContactStatus: started.lead.contactStatus,
            contactStatusReason:
              "Kein verwertbarer Kontakt auf der Website gefunden.",
          },
        );
      }

      await ctx.runMutation(
        internal.websiteEnrichment.finishLeadEnrichmentRun,
        {
          runId,
          status: "succeeded",
          currentStep: "website_enrichment",
          errors: 0,
        },
      );

      await ctx.runMutation(api.runs.appendEvent, {
        runId,
        level: "info",
        message: usable
          ? "Website-Enrichment erfolgreich mit nutzbarer E-Mail abgeschlossen."
          : "Website-Enrichment abgeschlossen, aber ohne nutzbare E-Mail.",
      });

      return runId;
    } catch (error) {
      const errorSummary = messageFromError(error);

      await ctx.runMutation(
        internal.websiteEnrichment.finishLeadEnrichmentRun,
        {
          runId,
          status: "failed",
          currentStep: "website_enrichment",
          errorSummary,
          errors: 1,
        },
      );

      await ctx.runMutation(api.runs.appendEvent, {
        runId,
        level: "error",
        message: "Website-Enrichment fehlgeschlagen.",
        details: [
          { label: "Fehler", value: errorSummary, source: "website_enrichment" },
        ],
      });

      // The lead is only known once the run has been started successfully.
      if (started) {
        await ctx.runMutation(
          internal.websiteEnrichment.patchLeadFromWebsiteEnrichment,
          {
            leadId: started.lead._id,
            currentContactStatus: started.lead.contactStatus,
            contactStatusReason: `Website-Enrichment fehlgeschlagen: ${errorSummary}`,
          },
        );
      }

      return null;
    } finally {
      // Release the contexts first, then the browser. Each handle is closed
      // independently so that one failing close() neither leaks the remaining
      // handles nor replaces the result of a run whose status has already
      // been recorded.
      for (const resource of [desktopContext, mobileContext, browser]) {
        if (!resource) {
          continue;
        }
        try {
          await resource.close();
        } catch (closeError) {
          console.warn(
            `Website enrichment: failed to release browser resource: ${messageFromError(closeError)}`,
          );
        }
      }
    }
  },
});
|
||||
Reference in New Issue
Block a user