diff --git a/apps/sim/app/api/organizations/[id]/data-retention/route.ts b/apps/sim/app/api/organizations/[id]/data-retention/route.ts index 65e291a00d3..e67d229df1a 100644 --- a/apps/sim/app/api/organizations/[id]/data-retention/route.ts +++ b/apps/sim/app/api/organizations/[id]/data-retention/route.ts @@ -1,37 +1,40 @@ import { AuditAction, AuditResourceType, recordAudit } from '@sim/audit' import { db } from '@sim/db' +import type { DataRetentionSettings } from '@sim/db/schema' import { member, organization } from '@sim/db/schema' import { createLogger } from '@sim/logger' import { and, eq } from 'drizzle-orm' import { type NextRequest, NextResponse } from 'next/server' -import { updateOrganizationDataRetentionContract } from '@/lib/api/contracts/organization' +import { + type OrganizationRetentionValues, + updateOrganizationDataRetentionContract, +} from '@/lib/api/contracts/organization' import { parseRequest, validationErrorResponse } from '@/lib/api/server' import { getSession } from '@/lib/auth' -import { - CLEANUP_CONFIG, - type OrganizationRetentionSettings, -} from '@/lib/billing/cleanup-dispatcher' +import { CLEANUP_CONFIG } from '@/lib/billing/cleanup-dispatcher' import { isOrganizationOnEnterprisePlan } from '@/lib/billing/core/subscription' import { isBillingEnabled } from '@/lib/core/config/env-flags' import { withRouteHandler } from '@/lib/core/utils/with-route-handler' const logger = createLogger('DataRetentionAPI') -function enterpriseDefaults(): OrganizationRetentionSettings { +function enterpriseDefaults(): OrganizationRetentionValues { return { logRetentionHours: CLEANUP_CONFIG['cleanup-logs'].defaults.enterprise, softDeleteRetentionHours: CLEANUP_CONFIG['cleanup-soft-deletes'].defaults.enterprise, taskCleanupHours: CLEANUP_CONFIG['cleanup-tasks'].defaults.enterprise, + piiRedaction: null, } } function normalizeConfigured( - settings: Partial | null | undefined -): OrganizationRetentionSettings { + settings: DataRetentionSettings | null | undefined +): OrganizationRetentionValues { return { logRetentionHours: settings?.logRetentionHours ?? null, softDeleteRetentionHours: settings?.softDeleteRetentionHours ?? null, taskCleanupHours: settings?.taskCleanupHours ?? null, + piiRedaction: settings?.piiRedaction?.rules ? { rules: settings.piiRedaction.rules } : null, } } @@ -152,7 +155,7 @@ export const PUT = withRouteHandler( } const current = normalizeConfigured(currentOrg.dataRetentionSettings) - const merged: OrganizationRetentionSettings = { ...current } + const merged: DataRetentionSettings = { ...current } if (body.logRetentionHours !== undefined) { merged.logRetentionHours = body.logRetentionHours } @@ -162,6 +165,9 @@ export const PUT = withRouteHandler( if (body.taskCleanupHours !== undefined) { merged.taskCleanupHours = body.taskCleanupHours } + if (body.piiRedaction !== undefined) { + merged.piiRedaction = body.piiRedaction + } const [updated] = await db .update(organization) diff --git a/apps/sim/ee/data-retention/components/data-retention-settings.tsx b/apps/sim/ee/data-retention/components/data-retention-settings.tsx index 5ac649517f8..be46edacce8 100644 --- a/apps/sim/ee/data-retention/components/data-retention-settings.tsx +++ b/apps/sim/ee/data-retention/components/data-retention-settings.tsx @@ -3,9 +3,24 @@ import { useEffect, useRef, useState } from 'react' import { createLogger } from '@sim/logger' import { toError } from '@sim/utils/errors' -import { Chip, ChipSelect, toast } from '@/components/emcn' +import { generateId } from '@sim/utils/id' +import { Plus } from 'lucide-react' +import { + Checkbox, + Chip, + ChipInput, + ChipModal, + ChipModalBody, + ChipModalField, + ChipModalFooter, + ChipModalHeader, + ChipSelect, + Search, + toast, +} from '@/components/emcn' import { useSession } from '@/lib/auth/auth-client' import { isBillingEnabled } from '@/lib/core/config/env-flags' +import { PII_ENTITY_GROUPS, SUPPORTED_PII_ENTITIES } from '@/lib/guardrails/pii-entities' import { getUserRole } from '@/lib/workspaces/organization/utils' import { SettingsSection } from '@/app/workspace/[workspaceId]/settings/components/settings-section/settings-section' import { InfoNote } from '@/ee/components/info-note' @@ -15,9 +30,12 @@ import { useUpdateOrganizationRetention, } from '@/ee/data-retention/hooks/data-retention' import { useOrganizations } from '@/hooks/queries/organization' +import { useWorkspacesQuery } from '@/hooks/queries/workspace' const logger = createLogger('DataRetentionSettings') +const ENTITY_LABELS = SUPPORTED_PII_ENTITIES as Record + const DAY_OPTIONS = [ { value: '1', label: '1 day' }, { value: '3', label: '3 days' }, @@ -32,6 +50,16 @@ const DAY_OPTIONS = [ { value: 'never', label: 'Forever' }, ] as const +/** + * Local editable shape of a PII redaction rule. `workspaceId: null` is the + * all-workspaces default; a non-null id is a per-workspace override of it. + */ +interface RuleDraft { + id: string + entityTypes: string[] + workspaceId: string | null +} + function hoursToDisplayDays(hours: number | null): string { if (hours === null) return 'never' return String(Math.round(hours / 24)) @@ -42,6 +70,20 @@ function daysToHours(days: string): number | null { return Number(days) * 24 } +function normalizeRule(rule: RuleDraft): string { + return JSON.stringify({ + entityTypes: [...rule.entityTypes].sort(), + workspaceId: rule.workspaceId, + }) +} + +function entitySummary(entityTypes: string[]): string { + if (entityTypes.length === 0) return 'Not redacted' + const labels = entityTypes.map((t) => ENTITY_LABELS[t] ?? t) + if (labels.length <= 3) return labels.join(', ') + return `${labels.slice(0, 3).join(', ')} +${labels.length - 3} more` +} + interface RetentionSelectProps { value: string onChange: (value: string) => void @@ -59,6 +101,144 @@ function RetentionSelect({ value, onChange }: RetentionSelectProps) { return } +interface EntityCheckboxGridProps { + selected: string[] + onChange: (entityTypes: string[]) => void +} + +function EntityCheckboxGrid({ selected, onChange }: EntityCheckboxGridProps) { + const [search, setSearch] = useState('') + const query = search.trim().toLowerCase() + + const groups = PII_ENTITY_GROUPS.map((group) => ({ + label: group.label, + entities: query + ? group.entities.filter( + (e) => e.label.toLowerCase().includes(query) || e.value.toLowerCase().includes(query) + ) + : group.entities, + })).filter((group) => group.entities.length > 0) + + const visibleValues: string[] = groups.flatMap((g) => g.entities.map((e) => e.value)) + const allVisibleSelected = + visibleValues.length > 0 && visibleValues.every((v) => selected.includes(v)) + + function toggle(value: string) { + onChange(selected.includes(value) ? selected.filter((v) => v !== value) : [...selected, value]) + } + + function toggleAllVisible() { + if (allVisibleSelected) { + onChange(selected.filter((v) => !visibleValues.includes(v))) + } else { + onChange([...new Set([...selected, ...visibleValues])]) + } + } + + return ( +
+
+ setSearch(e.target.value)} + className='min-w-0 flex-1' + /> + + {allVisibleSelected ? 'Deselect all' : 'Select all'} + +
+
+ {groups.map((group) => ( +
+ {group.label} +
+ {group.entities.map((entity) => { + const checkboxId = `pii-${entity.value}` + return ( + + ) + })} +
+
+ ))} +
+
+ ) +} + +interface RuleModalProps { + draft: RuleDraft + isNew: boolean + isSaving: boolean + /** Workspaces selectable for an override (excludes those taken by other overrides). */ + workspaceOptions: { value: string; label: string }[] + onChange: (draft: RuleDraft) => void + onClose: () => void + onSave: () => void +} + +function RuleModal({ + draft, + isNew, + isSaving, + workspaceOptions, + onChange, + onClose, + onSave, +}: RuleModalProps) { + const isDefault = draft.workspaceId === null + return ( + + + {isDefault + ? 'Default redaction · all workspaces' + : isNew + ? 'Add workspace override' + : 'Edit workspace override'} + + + {!isDefault && ( + + onChange({ ...draft, workspaceId: value })} + options={workspaceOptions} + align='start' + /> + + )} + + onChange({ ...draft, entityTypes })} + /> + + + + + ) +} + export function DataRetentionSettings() { const { data: session, isPending: sessionPending } = useSession() const { data: orgsData, isLoading: orgsLoading } = useOrganizations() @@ -68,6 +248,12 @@ export function DataRetentionSettings() { const { data, isLoading: retentionLoading } = useOrganizationRetention(orgId) const updateMutation = useUpdateOrganizationRetention() + const { data: workspaces } = useWorkspacesQuery(Boolean(orgId)) + const workspaceOptions = (workspaces ?? []) + .filter((w) => w.organizationId === orgId) + .map((w) => ({ value: w.id, label: w.name })) + const workspaceName = (id: string) => + workspaceOptions.find((w) => w.value === id)?.label ?? 'Unknown workspace' const userEmail = session?.user?.email const userRole = getUserRole(activeOrganization, userEmail) @@ -76,11 +262,18 @@ export function DataRetentionSettings() { const [logDays, setLogDays] = useState('') const [softDeleteDays, setSoftDeleteDays] = useState('') const [taskCleanupDays, setTaskCleanupDays] = useState('') - const [savedLogDays, setSavedLogDays] = useState('') - const [savedSoftDeleteDays, setSavedSoftDeleteDays] = useState('') - const [savedTaskCleanupDays, setSavedTaskCleanupDays] = useState('') + const [savedHours, setSavedHours] = useState('') + const [rules, setRules] = useState([]) + const [modalDraft, setModalDraft] = useState(null) + const [modalOriginal, setModalOriginal] = useState(null) + const [modalIsNew, setModalIsNew] = useState(false) + const [showUnsaved, setShowUnsaved] = useState(false) const formInitializedRef = useRef(false) + function hoursSnapshot(log: string, soft: string, task: string): string { + return JSON.stringify({ log, soft, task }) + } + useEffect(() => { if (!data || formInitializedRef.current) return const log = hoursToDisplayDays(data.effective.logRetentionHours) @@ -89,18 +282,119 @@ export function DataRetentionSettings() { setLogDays(log) setSoftDeleteDays(soft) setTaskCleanupDays(task) - setSavedLogDays(log) - setSavedSoftDeleteDays(soft) - setSavedTaskCleanupDays(task) + setSavedHours(hoursSnapshot(log, soft, task)) + setRules( + (data.configured.piiRedaction?.rules ?? []).map((r) => ({ + id: r.id, + entityTypes: r.entityTypes, + workspaceId: r.workspaceId, + })) + ) formInitializedRef.current = true }, [data]) - const hasChanges = - logDays !== savedLogDays || - softDeleteDays !== savedSoftDeleteDays || - taskCleanupDays !== savedTaskCleanupDays + const hoursChanged = hoursSnapshot(logDays, softDeleteDays, taskCleanupDays) !== savedHours + const modalChanged = + modalDraft !== null && + modalOriginal !== null && + normalizeRule(modalDraft) !== normalizeRule(modalOriginal) - async function handleSave() { + const defaultRule = rules.find((r) => r.workspaceId === null) ?? null + const overrideRules = rules.filter((r) => r.workspaceId !== null) + const takenWorkspaceIds = new Set(overrideRules.map((r) => r.workspaceId as string)) + const freeWorkspaces = workspaceOptions.filter((w) => !takenWorkspaceIds.has(w.value)) + + /** Workspaces selectable for `draft` — excludes workspaces taken by OTHER overrides. */ + function overrideOptionsForDraft(draft: RuleDraft): { value: string; label: string }[] { + const otherTaken = new Set( + rules + .filter((r) => r.id !== draft.id && r.workspaceId !== null) + .map((r) => r.workspaceId as string) + ) + return workspaceOptions.filter((w) => !otherTaken.has(w.value)) + } + + async function persistRules(nextRules: RuleDraft[]) { + if (!orgId) return + await updateMutation.mutateAsync({ + orgId, + settings: { + piiRedaction: { + rules: nextRules.map((r) => ({ + id: r.id, + entityTypes: r.entityTypes, + workspaceId: r.workspaceId, + })), + }, + }, + }) + setRules(nextRules) + } + + function openEditDefault() { + const rule: RuleDraft = defaultRule ?? { id: generateId(), entityTypes: [], workspaceId: null } + setModalIsNew(defaultRule === null) + setModalOriginal(rule) + setModalDraft({ ...rule }) + } + + function openAddOverride() { + const workspaceId = freeWorkspaces[0]?.value + if (!workspaceId) return + const blank: RuleDraft = { id: generateId(), entityTypes: [], workspaceId } + setModalIsNew(true) + setModalOriginal(blank) + setModalDraft(blank) + } + + function openEditOverride(rule: RuleDraft) { + setModalIsNew(false) + setModalOriginal(rule) + setModalDraft({ ...rule }) + } + + function clearModal() { + setModalDraft(null) + setModalOriginal(null) + setShowUnsaved(false) + } + + function requestCloseModal() { + if (modalChanged) { + setShowUnsaved(true) + } else { + clearModal() + } + } + + async function saveModalRule() { + if (!modalDraft) return + const next = rules.some((r) => r.id === modalDraft.id) + ? rules.map((r) => (r.id === modalDraft.id ? modalDraft : r)) + : [...rules, modalDraft] + try { + await persistRules(next) + clearModal() + toast.success('PII redaction saved.') + } catch (error) { + const msg = toError(error).message + logger.error('Failed to save PII redaction', { error: msg }) + toast.error(msg) + } + } + + async function removeRule(id: string) { + try { + await persistRules(rules.filter((r) => r.id !== id)) + toast.success('PII redaction updated.') + } catch (error) { + const msg = toError(error).message + logger.error('Failed to update PII redaction', { error: msg }) + toast.error(msg) + } + } + + async function handleSaveHours() { if (!orgId) return try { await updateMutation.mutateAsync({ @@ -111,9 +405,7 @@ export function DataRetentionSettings() { taskCleanupHours: daysToHours(taskCleanupDays), }, }) - setSavedLogDays(logDays) - setSavedSoftDeleteDays(softDeleteDays) - setSavedTaskCleanupDays(taskCleanupDays) + setSavedHours(hoursSnapshot(logDays, softDeleteDays, taskCleanupDays)) toast.success('Data retention settings saved.') } catch (error) { const msg = toError(error).message @@ -165,8 +457,8 @@ export function DataRetentionSettings() {
{updateMutation.isPending ? 'Saving...' : 'Save'} @@ -197,8 +489,125 @@ export function DataRetentionSettings() {
+ +
+
+
+ + Default · all workspaces + + {!defaultRule && ( + + Add redaction + + )} +
+ {defaultRule && ( +
+ + {entitySummary(defaultRule.entityTypes)} + +
+ Edit + removeRule(defaultRule.id)} + disabled={updateMutation.isPending} + > + Delete + +
+
+ )} +
+ {defaultRule && ( +
+
+ + Workspace overrides + + + Add override + +
+ {overrideRules.length === 0 ? ( +

+ No overrides — every workspace uses the default. +

+ ) : ( +
+ {overrideRules.map((rule) => ( +
+
+ + {workspaceName(rule.workspaceId as string)} + + + {entitySummary(rule.entityTypes)} + +
+
+ openEditOverride(rule)}>Edit + removeRule(rule.id)} + disabled={updateMutation.isPending} + > + Delete + +
+
+ ))} + + Workspaces not listed use the default. + +
+ )} +
+ )} +
+
+ {modalDraft && ( + + )} + + setShowUnsaved(false)}>Unsaved changes + +

+ You have unsaved changes. Save them before closing? +

+
+ setShowUnsaved(false)} + cancelDisabled={updateMutation.isPending} + secondaryActions={[{ label: 'Discard', onClick: clearModal, variant: 'destructive' }]} + primaryAction={{ + label: updateMutation.isPending ? 'Saving...' : 'Save', + onClick: saveModalRule, + disabled: updateMutation.isPending, + }} + /> +
) } diff --git a/apps/sim/ee/data-retention/hooks/data-retention.ts b/apps/sim/ee/data-retention/hooks/data-retention.ts index 6a8e39cc066..f1e85a890a2 100644 --- a/apps/sim/ee/data-retention/hooks/data-retention.ts +++ b/apps/sim/ee/data-retention/hooks/data-retention.ts @@ -6,6 +6,7 @@ import { getOrganizationDataRetentionContract, type OrganizationDataRetention, type OrganizationRetentionValues, + type UpdateOrganizationDataRetentionBody, updateOrganizationDataRetentionContract, } from '@/lib/api/contracts/organization' @@ -39,7 +40,7 @@ export function useOrganizationRetention(orgId: string | undefined) { interface UpdateRetentionVariables { orgId: string - settings: Partial + settings: UpdateOrganizationDataRetentionBody } export function useUpdateOrganizationRetention() { diff --git a/apps/sim/lib/api/contracts/organization.ts b/apps/sim/lib/api/contracts/organization.ts index eaa4c3dbf75..fb8b1a1033c 100644 --- a/apps/sim/lib/api/contracts/organization.ts +++ b/apps/sim/lib/api/contracts/organization.ts @@ -1,5 +1,9 @@ import { z } from 'zod' -import { workspaceIdSchema } from '@/lib/api/contracts/primitives' +import { + type PiiRedactionSettings, + piiRedactionSettingsSchema, + workspaceIdSchema, +} from '@/lib/api/contracts/primitives' import { organizationBillingDataSchema } from '@/lib/api/contracts/subscription' import { defineRouteContract } from '@/lib/api/contracts/types' import { workspacePermissionSchema } from '@/lib/api/contracts/workspaces' @@ -98,16 +102,24 @@ const organizationDataRetentionHoursSchema = z .nullable() .optional() +export type { PiiRedactionSettings } + export const updateOrganizationDataRetentionBodySchema = z.object({ logRetentionHours: organizationDataRetentionHoursSchema, softDeleteRetentionHours: organizationDataRetentionHoursSchema, taskCleanupHours: organizationDataRetentionHoursSchema, + piiRedaction: piiRedactionSettingsSchema.optional(), }) +export type UpdateOrganizationDataRetentionBody = z.input< + typeof updateOrganizationDataRetentionBodySchema +> + const organizationRetentionValuesSchema = z.object({ logRetentionHours: z.number().int().nullable(), softDeleteRetentionHours: z.number().int().nullable(), taskCleanupHours: z.number().int().nullable(), + piiRedaction: piiRedactionSettingsSchema.nullable(), }) export type OrganizationRetentionValues = z.output diff --git a/apps/sim/lib/api/contracts/primitives.ts b/apps/sim/lib/api/contracts/primitives.ts index 9fac093c5af..e3e61484020 100644 --- a/apps/sim/lib/api/contracts/primitives.ts +++ b/apps/sim/lib/api/contracts/primitives.ts @@ -85,6 +85,42 @@ export const userFileSchema = z }) .passthrough() +/** A single PII redaction rule targeting one scope (all workspaces, or one). */ +export const piiRedactionRuleSchema = z.object({ + id: z.string().min(1), + name: z.string().max(100).optional(), + /** Presidio entity types to mask. Empty = redact nothing for this scope. */ + entityTypes: z.array(z.string().min(1, 'Entity type cannot be empty')).max(100), + /** null = all workspaces; otherwise the single targeted workspace. */ + workspaceId: z.string().min(1).nullable(), +}) + +export type PiiRedactionRule = z.output + +/** + * Enterprise PII redaction policy applied to workflow logs on persist. Each + * scope is unique: at most one all-workspaces rule (`workspaceId: null`) and at + * most one rule per workspace — resolution is most-specific-wins, so duplicate + * scopes would make masking depend on array order. + */ +export const piiRedactionSettingsSchema = z.object({ + rules: z + .array(piiRedactionRuleSchema) + .max(1000) + .refine( + (rules) => { + const scopes = rules.map((r) => r.workspaceId ?? '__all__') + return new Set(scopes).size === scopes.length + }, + { + message: + 'Each workspace (and the all-workspaces default) may have at most one PII redaction rule.', + } + ), +}) + +export type PiiRedactionSettings = z.output + export const booleanQueryFlagSchema = z.preprocess( (value) => { if (typeof value === 'boolean') return value diff --git a/apps/sim/lib/billing/retention.test.ts b/apps/sim/lib/billing/retention.test.ts new file mode 100644 index 00000000000..2852cb6a640 --- /dev/null +++ b/apps/sim/lib/billing/retention.test.ts @@ -0,0 +1,60 @@ +/** + * @vitest-environment node + */ +import type { DataRetentionSettings, PiiRedactionRule } from '@sim/db/schema' +import { describe, expect, it } from 'vitest' +import { resolveEffectivePiiRedaction } from '@/lib/billing/retention' + +function settings(rules: PiiRedactionRule[]): DataRetentionSettings { + return { piiRedaction: { rules } } +} + +describe('resolveEffectivePiiRedaction', () => { + const allRule: PiiRedactionRule = { + id: 'r-all', + entityTypes: ['EMAIL_ADDRESS', 'PHONE_NUMBER'], + workspaceId: null, + } + + it('applies the all-workspaces rule when the workspace has no specific rule', () => { + const result = resolveEffectivePiiRedaction({ + orgSettings: settings([allRule]), + workspaceId: 'ws-1', + }) + expect(result).toEqual({ enabled: true, entityTypes: ['EMAIL_ADDRESS', 'PHONE_NUMBER'] }) + }) + + it('lets a workspace-specific rule override the all rule', () => { + const result = resolveEffectivePiiRedaction({ + orgSettings: settings([allRule, { id: 'r-1', entityTypes: ['US_SSN'], workspaceId: 'ws-1' }]), + workspaceId: 'ws-1', + }) + expect(result).toEqual({ enabled: true, entityTypes: ['US_SSN'] }) + }) + + it('exempts a workspace when its specific rule has no entity types', () => { + const result = resolveEffectivePiiRedaction({ + orgSettings: settings([allRule, { id: 'r-1', entityTypes: [], workspaceId: 'ws-1' }]), + workspaceId: 'ws-1', + }) + expect(result).toEqual({ enabled: false, entityTypes: [] }) + }) + + it('is disabled when no rule matches and there is no all rule', () => { + const result = resolveEffectivePiiRedaction({ + orgSettings: settings([{ id: 'r-1', entityTypes: ['US_SSN'], workspaceId: 'ws-2' }]), + workspaceId: 'ws-1', + }) + expect(result).toEqual({ enabled: false, entityTypes: [] }) + }) + + it('is disabled when there are no rules', () => { + expect( + resolveEffectivePiiRedaction({ orgSettings: settings([]), workspaceId: 'ws-1' }) + ).toEqual({ enabled: false, entityTypes: [] }) + expect(resolveEffectivePiiRedaction({ orgSettings: null, workspaceId: 'ws-1' })).toEqual({ + enabled: false, + entityTypes: [], + }) + }) +}) diff --git a/apps/sim/lib/billing/retention.ts b/apps/sim/lib/billing/retention.ts new file mode 100644 index 00000000000..183dbb280e1 --- /dev/null +++ b/apps/sim/lib/billing/retention.ts @@ -0,0 +1,38 @@ +import type { DataRetentionSettings } from '@sim/db/schema' + +export interface EffectivePiiRedaction { + enabled: boolean + /** Presidio entity types to mask. Empty = redact all detected PII. */ + entityTypes: string[] +} + +export const DEFAULT_PII_REDACTION: EffectivePiiRedaction = { + enabled: false, + entityTypes: [], +} + +/** + * Resolve the effective PII redaction policy for a workspace from the org-level + * rules list, most-specific-wins (never unioned): the workspace's own rule takes + * precedence over the all-workspaces rule (`workspaceId: null`). A resolved rule + * with no entity types redacts nothing — so a workspace-specific empty rule + * exempts that workspace, overriding the all rule. Defensive about the + * loosely-typed JSON column. + */ +export function resolveEffectivePiiRedaction(params: { + orgSettings: DataRetentionSettings | null | undefined + workspaceId: string +}): EffectivePiiRedaction { + const rules = params.orgSettings?.piiRedaction?.rules + if (!Array.isArray(rules) || rules.length === 0) return DEFAULT_PII_REDACTION + + const rule = + rules.find((r) => r?.workspaceId === params.workspaceId) ?? + rules.find((r) => r?.workspaceId == null) + + const types = Array.isArray(rule?.entityTypes) + ? rule.entityTypes.filter((t): t is string => typeof t === 'string') + : [] + if (types.length === 0) return DEFAULT_PII_REDACTION + return { enabled: true, entityTypes: types } +} diff --git a/apps/sim/lib/guardrails/pii-entities.ts b/apps/sim/lib/guardrails/pii-entities.ts new file mode 100644 index 00000000000..0e67fe22ff7 --- /dev/null +++ b/apps/sim/lib/guardrails/pii-entities.ts @@ -0,0 +1,128 @@ +/** + * Client-safe catalog of Microsoft Presidio PII entity types. Single source of + * truth shared by the server-only validator (`validate_pii.ts`) and client + * settings UI — keep no node-only imports here. + */ +export const SUPPORTED_PII_ENTITIES = { + // Common/Global + CREDIT_CARD: 'Credit card number', + CRYPTO: 'Cryptocurrency wallet address', + DATE_TIME: 'Date or time', + EMAIL_ADDRESS: 'Email address', + IBAN_CODE: 'International Bank Account Number', + IP_ADDRESS: 'IP address', + NRP: 'Nationality, religious or political group', + LOCATION: 'Location', + PERSON: 'Person name', + PHONE_NUMBER: 'Phone number', + MEDICAL_LICENSE: 'Medical license number', + URL: 'URL', + VIN: 'Vehicle Identification Number', + + // USA + US_BANK_NUMBER: 'US bank account number', + US_DRIVER_LICENSE: 'US driver license', + US_ITIN: 'US Individual Taxpayer Identification Number', + US_PASSPORT: 'US passport number', + US_SSN: 'US Social Security Number', + + // UK + UK_NHS: 'UK NHS number', + UK_NINO: 'UK National Insurance Number', + + // Other countries + ES_NIF: 'Spanish NIF number', + ES_NIE: 'Spanish NIE number', + IT_FISCAL_CODE: 'Italian fiscal code', + IT_DRIVER_LICENSE: 'Italian driver license', + IT_VAT_CODE: 'Italian VAT code', + IT_PASSPORT: 'Italian passport', + IT_IDENTITY_CARD: 'Italian identity card', + PL_PESEL: 'Polish PESEL number', + SG_NRIC_FIN: 'Singapore NRIC/FIN', + SG_UEN: 'Singapore Unique Entity Number', + AU_ABN: 'Australian Business Number', + AU_ACN: 'Australian Company Number', + AU_TFN: 'Australian Tax File Number', + AU_MEDICARE: 'Australian Medicare number', + IN_PAN: 'Indian Permanent Account Number', + IN_AADHAAR: 'Indian Aadhaar number', + IN_VEHICLE_REGISTRATION: 'Indian vehicle registration', + IN_VOTER: 'Indian voter ID', + IN_PASSPORT: 'Indian passport', + FI_PERSONAL_IDENTITY_CODE: 'Finnish Personal Identity Code', + KR_RRN: 'Korean Resident Registration Number', + TH_TNIN: 'Thai National ID Number', +} as const + +export type PIIEntityType = keyof typeof SUPPORTED_PII_ENTITIES + +/** Flat `{ value, label }` options for entity-type pickers, in catalog order. */ +export const PII_ENTITY_OPTIONS: ReadonlyArray<{ value: PIIEntityType; label: string }> = + Object.entries(SUPPORTED_PII_ENTITIES).map(([value, label]) => ({ + value: value as PIIEntityType, + label, + })) + +/** Entity types grouped by region, for a grouped checkbox picker. */ +export const PII_ENTITY_GROUPS: ReadonlyArray<{ + label: string + entities: ReadonlyArray<{ value: PIIEntityType; label: string }> +}> = [ + { + label: 'Common', + entities: [ + 'PERSON', + 'EMAIL_ADDRESS', + 'PHONE_NUMBER', + 'CREDIT_CARD', + 'IP_ADDRESS', + 'LOCATION', + 'DATE_TIME', + 'URL', + 'IBAN_CODE', + 'CRYPTO', + 'NRP', + 'MEDICAL_LICENSE', + 'VIN', + ], + }, + { + label: 'United States', + entities: ['US_SSN', 'US_PASSPORT', 'US_DRIVER_LICENSE', 'US_BANK_NUMBER', 'US_ITIN'], + }, + { label: 'United Kingdom', entities: ['UK_NHS', 'UK_NINO'] }, + { + label: 'Other regions', + entities: [ + 'ES_NIF', + 'ES_NIE', + 'IT_FISCAL_CODE', + 'IT_DRIVER_LICENSE', + 'IT_VAT_CODE', + 'IT_PASSPORT', + 'IT_IDENTITY_CARD', + 'PL_PESEL', + 'SG_NRIC_FIN', + 'SG_UEN', + 'AU_ABN', + 'AU_ACN', + 'AU_TFN', + 'AU_MEDICARE', + 'IN_PAN', + 'IN_AADHAAR', + 'IN_VEHICLE_REGISTRATION', + 'IN_VOTER', + 'IN_PASSPORT', + 'FI_PERSONAL_IDENTITY_CODE', + 'KR_RRN', + 'TH_TNIN', + ], + }, +].map((group) => ({ + label: group.label, + entities: group.entities.map((value) => ({ + value: value as PIIEntityType, + label: SUPPORTED_PII_ENTITIES[value as PIIEntityType], + })), +})) diff --git a/apps/sim/lib/guardrails/validate_pii.py b/apps/sim/lib/guardrails/validate_pii.py index 570786b8d9e..d475b96e233 100644 --- a/apps/sim/lib/guardrails/validate_pii.py +++ b/apps/sim/lib/guardrails/validate_pii.py @@ -12,7 +12,7 @@ from typing import List, Dict, Any try: - from presidio_analyzer import AnalyzerEngine + from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer from presidio_anonymizer import AnonymizerEngine from presidio_anonymizer.entities import OperatorConfig except ImportError: @@ -24,6 +24,52 @@ sys.exit(0) +class VinRecognizer(PatternRecognizer): + """ + Recognizes Vehicle Identification Numbers (17 chars, A-Z/0-9 excluding + I/O/Q) and validates the ISO 3779 check digit (position 9). Validation makes + accidental matches on arbitrary 17-char codes (request ids, SKUs, tokens) + extremely unlikely. Note: some non-North-American VINs don't use the check + digit and will be skipped — an intentional bias toward precision. + """ + + _TRANSLIT = { + **{str(d): d for d in range(10)}, + "A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "H": 8, + "J": 1, "K": 2, "L": 3, "M": 4, "N": 5, "P": 7, "R": 9, + "S": 2, "T": 3, "U": 4, "V": 5, "W": 6, "X": 7, "Y": 8, "Z": 9, + } + _WEIGHTS = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2] + + def validate_result(self, pattern_text: str): + vin = pattern_text.upper() + if len(vin) != 17: + return False + try: + total = sum(self._TRANSLIT[c] * w for c, w in zip(vin, self._WEIGHTS)) + except KeyError: + return False + check = total % 11 + expected = "X" if check == 10 else str(check) + return vin[8] == expected + + +def build_analyzer() -> "AnalyzerEngine": + """ + AnalyzerEngine with custom recognizers registered on top of the Presidio + defaults. Adds a check-digit-validated VIN recognizer. + """ + analyzer = AnalyzerEngine() + vin_pattern = Pattern(name="vin", regex=r"\b[A-HJ-NPR-Z0-9]{17}\b", score=0.7) + vin_recognizer = VinRecognizer( + supported_entity="VIN", + patterns=[vin_pattern], + context=["vin", "vehicle", "chassis"], + ) + analyzer.registry.add_recognizer(vin_recognizer) + return analyzer + + def detect_pii( text: str, entity_types: List[str], @@ -44,7 +90,7 @@ def detect_pii( """ try: # Initialize Presidio engines - analyzer = AnalyzerEngine() + analyzer = build_analyzer() # Analyze text for PII results = analyzer.analyze( @@ -124,18 +170,64 @@ def detect_pii( } +def mask_batch( + texts: List[str], + entity_types: List[str], + language: str = "en" +) -> Dict[str, Any]: + """ + Mask PII across many strings in a single process, reusing one analyzer + + anonymizer instance (engine construction loads the spaCy model and is the + dominant cost). Returns masked text per input, in input order; strings with + no detected PII are returned unchanged so callers can substitute directly. + """ + analyzer = build_analyzer() + anonymizer = AnonymizerEngine() + entities = entity_types if entity_types else None + + results = [] + for text in texts: + if not text: + results.append({"maskedText": text}) + continue + analyzer_results = analyzer.analyze(text=text, entities=entities, language=language) + if not analyzer_results: + results.append({"maskedText": text}) + continue + operators = { + entity_type: OperatorConfig("replace", {"new_value": f"<{entity_type}>"}) + for entity_type in set([r.entity_type for r in analyzer_results]) + } + anonymized = anonymizer.anonymize( + text=text, + analyzer_results=analyzer_results, + operators=operators + ) + results.append({"maskedText": anonymized.text}) + + return {"passed": True, "results": results} + + def main(): """Main entry point for CLI usage""" try: # Read input from stdin input_data = sys.stdin.read() data = json.loads(input_data) - - text = data.get("text", "") + entity_types = data.get("entityTypes", []) - mode = data.get("mode", "block") language = data.get("language", "en") - + + # Batch mask mode: an array of texts processed with one warm engine pair. + if "texts" in data: + texts = data.get("texts", []) + result = mask_batch(texts, entity_types, language) + print(f"__SIM_RESULT__={json.dumps(result)}") + return + + text = data.get("text", "") + mode = data.get("mode", "block") + # Validate inputs if not text: result = { @@ -145,7 +237,7 @@ def main(): } else: result = detect_pii(text, entity_types, mode, language) - + # Output result with marker for parsing print(f"__SIM_RESULT__={json.dumps(result)}") diff --git a/apps/sim/lib/guardrails/validate_pii.ts b/apps/sim/lib/guardrails/validate_pii.ts index 7f1ca2a89cd..ba6886bb92d 100644 --- a/apps/sim/lib/guardrails/validate_pii.ts +++ b/apps/sim/lib/guardrails/validate_pii.ts @@ -6,6 +6,13 @@ import { createLogger } from '@sim/logger' const logger = createLogger('PIIValidator') const DEFAULT_TIMEOUT = 30000 // 30 seconds +/** + * Max total bytes of text sent to a single Presidio subprocess. spaCy NER is the + * bottleneck, so large payloads are split into multiple short calls instead of + * one that risks the 30s timeout. + */ +const PII_CHUNK_MAX_BYTES = 256 * 1024 + export interface PIIValidationInput { text: string entityTypes: string[] // e.g., ["PERSON", "EMAIL_ADDRESS", "CREDIT_CARD"] @@ -70,6 +77,126 @@ export async function validatePII(input: PIIValidationInput): Promise { + if (texts.length === 0) return [] + + const chunks: string[][] = [] + let current: string[] = [] + let currentBytes = 0 + for (const text of texts) { + const bytes = Buffer.byteLength(text, 'utf8') + if (current.length > 0 && currentBytes + bytes > PII_CHUNK_MAX_BYTES) { + chunks.push(current) + current = [] + currentBytes = 0 + } + current.push(text) + currentBytes += bytes + } + if (current.length > 0) chunks.push(current) + + const masked: string[] = [] + for (const chunk of chunks) { + const result = await runPythonScript({ + texts: chunk, + entityTypes, + mode: 'mask', + language, + }) + if (!result.passed || !result.results || result.results.length !== chunk.length) { + throw new Error(result.error || 'PII batch masking returned an unexpected result') + } + for (const item of result.results) masked.push(item.maskedText) + } + + return masked +} + +/** + * Spawn the Presidio Python script, write the payload to stdin as JSON, and parse + * the `__SIM_RESULT__=` marker from stdout. Rejects on non-zero exit, timeout, + * spawn failure, or a missing/unparseable marker. + */ +function runPythonScript(payload: Record): Promise { + return new Promise((resolve, reject) => { + const guardrailsDir = path.join(process.cwd(), 'lib/guardrails') + const scriptPath = path.join(guardrailsDir, 'validate_pii.py') + const venvPython = path.join(guardrailsDir, 'venv/bin/python3') + const pythonCmd = fs.existsSync(venvPython) ? venvPython : 'python3' + + const python = spawn(pythonCmd, [scriptPath]) + let stdout = '' + let stderr = '' + + const timeout = setTimeout(() => { + python.kill() + reject(new Error('PII processing timeout')) + }, DEFAULT_TIMEOUT) + + // stdin errors (e.g. EPIPE when the child exits before draining the payload — + // chunks can exceed the OS pipe buffer) emit on stdin, not the process. Without + // a listener Node throws an unhandled 'error' and crashes; funnel it into the + // promise so the caller's fail-safe scrub path handles it. + python.stdin.on('error', (error: Error) => { + clearTimeout(timeout) + reject(new Error(`PII script stdin error: ${error.message}`)) + }) + python.stdin.write(JSON.stringify(payload)) + python.stdin.end() + python.stdout.on('data', (data) => { + stdout += data.toString() + }) + python.stderr.on('data', (data) => { + stderr += data.toString() + }) + + python.on('close', (code) => { + clearTimeout(timeout) + if (code !== 0) { + reject(new Error(stderr || `PII script exited with code ${code}`)) + return + } + const prefix = '__SIM_RESULT__=' + const marker = stdout.split('\n').find((l) => l.startsWith(prefix)) + if (!marker) { + reject(new Error(`No result marker in PII script output: ${stdout.substring(0, 200)}`)) + return + } + try { + resolve(JSON.parse(marker.slice(prefix.length)) as T) + } catch (error: any) { + reject(new Error(`Failed to parse PII script result: ${error.message}`)) + } + }) + + python.on('error', (error) => { + clearTimeout(timeout) + reject( + new Error( + `Failed to execute Python: ${error.message}. Make sure Python 3 and Presidio are installed.` + ) + ) + }) + }) +} + /** * Execute Python PII detection script */ @@ -107,6 +234,12 @@ async function executePythonPIIDetection( mode, language, }) + // See runPythonScript: stdin errors (EPIPE on early child exit) must be + // caught here or Node throws an unhandled 'error' and crashes the process. + python.stdin.on('error', (error: Error) => { + clearTimeout(timeout) + reject(new Error(`Failed to write to Python: ${error.message}`)) + }) python.stdin.write(inputData) python.stdin.end() @@ -184,59 +317,4 @@ async function executePythonPIIDetection( }) } -/** - * List of all supported PII entity types - * Based on Microsoft Presidio's supported entities - */ -export const SUPPORTED_PII_ENTITIES = { - // Common/Global - CREDIT_CARD: 'Credit card number', - CRYPTO: 'Cryptocurrency wallet address', - DATE_TIME: 'Date or time', - EMAIL_ADDRESS: 'Email address', - IBAN_CODE: 'International Bank Account Number', - IP_ADDRESS: 'IP address', - NRP: 'Nationality, religious or political group', - LOCATION: 'Location', - PERSON: 'Person name', - PHONE_NUMBER: 'Phone number', - MEDICAL_LICENSE: 'Medical license number', - URL: 'URL', - - // USA - US_BANK_NUMBER: 'US bank account number', - US_DRIVER_LICENSE: 'US driver license', - US_ITIN: 'US Individual Taxpayer Identification Number', - US_PASSPORT: 'US passport number', - US_SSN: 'US Social Security Number', - - // UK - UK_NHS: 'UK NHS number', - UK_NINO: 'UK National Insurance Number', - - // Other countries - ES_NIF: 'Spanish NIF number', - ES_NIE: 'Spanish NIE number', - IT_FISCAL_CODE: 'Italian fiscal code', - IT_DRIVER_LICENSE: 'Italian driver license', - IT_VAT_CODE: 'Italian VAT code', - IT_PASSPORT: 'Italian passport', - IT_IDENTITY_CARD: 'Italian identity card', - PL_PESEL: 'Polish PESEL number', - SG_NRIC_FIN: 'Singapore NRIC/FIN', - SG_UEN: 'Singapore Unique Entity Number', - AU_ABN: 'Australian Business Number', - AU_ACN: 'Australian Company Number', - AU_TFN: 'Australian Tax File Number', - AU_MEDICARE: 'Australian Medicare number', - IN_PAN: 'Indian Permanent Account Number', - IN_AADHAAR: 'Indian Aadhaar number', - IN_VEHICLE_REGISTRATION: 'Indian vehicle registration', - IN_VOTER: 'Indian voter ID', - IN_PASSPORT: 'Indian passport', - FI_PERSONAL_IDENTITY_CODE: 'Finnish Personal Identity Code', - KR_RRN: 'Korean Resident Registration Number', - TH_TNIN: 'Thai National ID Number', -} as const - -export type PIIEntityType = keyof typeof SUPPORTED_PII_ENTITIES +export { type PIIEntityType, SUPPORTED_PII_ENTITIES } from '@/lib/guardrails/pii-entities' diff --git a/apps/sim/lib/logs/execution/logger.ts b/apps/sim/lib/logs/execution/logger.ts index e8c6edd7c55..9a77a0ec427 100644 --- a/apps/sim/lib/logs/execution/logger.ts +++ b/apps/sim/lib/logs/execution/logger.ts @@ -1,11 +1,13 @@ import { db } from '@sim/db' import { member, + organization, usageLog, userStats, user as userTable, workflow, workflowExecutionLogs, + workspace, } from '@sim/db/schema' import { createLogger } from '@sim/logger' import { getErrorMessage } from '@sim/utils/errors' @@ -24,6 +26,7 @@ import { recordUsage, stableEventKey, } from '@/lib/billing/core/usage-log' +import { resolveEffectivePiiRedaction } from '@/lib/billing/retention' import { checkAndBillOverageThreshold } from '@/lib/billing/threshold-billing' import { isBillingEnabled } from '@/lib/core/config/env-flags' import { redactApiKeys } from '@/lib/core/security/redaction' @@ -32,6 +35,7 @@ import { collectLargeValueReferenceKeys, replaceLargeValueReferenceKeysWithClient, } from '@/lib/execution/payloads/large-value-metadata' +import { type RedactablePayload, redactPIIFromExecution } from '@/lib/logs/execution/pii-redaction' import { snapshotService } from '@/lib/logs/execution/snapshot/service' import { externalizeExecutionData, @@ -585,6 +589,37 @@ export class ExecutionLogger implements IExecutionLoggerService { } } + /** + * Mask PII from log content before persistence when the execution's workspace + * (via workspace override or org default) has enterprise PII redaction enabled. + * Resolved at persist time so both the inline and externalized write paths are + * covered. Returns the payload unchanged when disabled or non-enterprise. + */ + private async applyPiiRedaction( + workspaceId: string | null, + payload: RedactablePayload + ): Promise { + if (!workspaceId) return payload + + const [row] = await db + .select({ orgSettings: organization.dataRetentionSettings }) + .from(workspace) + .leftJoin(organization, eq(organization.id, workspace.organizationId)) + .where(eq(workspace.id, workspaceId)) + .limit(1) + if (!row) return payload + + // Rules are only writable by enterprise orgs (route-gated), so an enabled + // rule already implies entitlement. We deliberately do NOT re-check + // `isWorkspaceOnEnterprisePlan` here: it returns false on transient lookup + // errors, which would silently skip masking and leak PII (fail-open). When + // rules are present we always redact (fail-safe; over-redaction at worst). + const config = resolveEffectivePiiRedaction({ orgSettings: row.orgSettings, workspaceId }) + if (!config.enabled) return payload + + return redactPIIFromExecution(payload, { entityTypes: config.entityTypes }) + } + async completeWorkflowExecution(params: { executionId: string endedAt: string @@ -720,6 +755,26 @@ export class ExecutionLogger implements IExecutionLoggerService { const redactedWorkflowInput = filteredWorkflowInput !== undefined ? redactApiKeys(filteredWorkflowInput) : undefined + const pii = await this.applyPiiRedaction(existingLog?.workspaceId ?? null, { + traceSpans: redactedTraceSpans, + finalOutput: redactedFinalOutput, + ...(redactedWorkflowInput !== undefined ? { workflowInput: redactedWorkflowInput } : {}), + ...(builtExecutionData.error !== undefined ? { error: builtExecutionData.error } : {}), + ...(builtExecutionData.completionFailure !== undefined + ? { completionFailure: builtExecutionData.completionFailure } + : {}), + ...(builtExecutionData.trigger !== undefined ? { trigger: builtExecutionData.trigger } : {}), + ...(builtExecutionData.executionState !== undefined + ? { executionState: builtExecutionData.executionState } + : {}), + ...(builtExecutionData.environment !== undefined + ? { environment: builtExecutionData.environment } + : {}), + ...(builtExecutionData.correlation !== undefined + ? { correlation: builtExecutionData.correlation } + : {}), + }) + const rawDurationMs = isResume && existingLog?.startedAt ? new Date(endedAt).getTime() - new Date(existingLog.startedAt).getTime() @@ -731,9 +786,23 @@ export class ExecutionLogger implements IExecutionLoggerService { const cleanExecutionData: ExecutionData = { ...builtExecutionData, - traceSpans: redactedTraceSpans, - finalOutput: redactedFinalOutput, - ...(redactedWorkflowInput !== undefined ? { workflowInput: redactedWorkflowInput } : {}), + traceSpans: pii.traceSpans as TraceSpan[], + finalOutput: pii.finalOutput as BlockOutputData, + ...(pii.workflowInput !== undefined ? { workflowInput: pii.workflowInput } : {}), + ...(pii.error !== undefined ? { error: pii.error as string } : {}), + ...(pii.completionFailure !== undefined + ? { completionFailure: pii.completionFailure as string } + : {}), + ...(pii.trigger !== undefined ? { trigger: pii.trigger as ExecutionTrigger } : {}), + ...(pii.executionState !== undefined + ? { executionState: pii.executionState as SerializableExecutionState } + : {}), + ...(pii.environment !== undefined + ? { environment: pii.environment as ExecutionEnvironment } + : {}), + ...(pii.correlation !== undefined + ? { correlation: pii.correlation as ExecutionData['correlation'] } + : {}), } stripSpanCosts((cleanExecutionData as Record).traceSpans) diff --git a/apps/sim/lib/logs/execution/pii-redaction.test.ts b/apps/sim/lib/logs/execution/pii-redaction.test.ts new file mode 100644 index 00000000000..dccbc59cc38 --- /dev/null +++ b/apps/sim/lib/logs/execution/pii-redaction.test.ts @@ -0,0 +1,120 @@ +/** + * @vitest-environment node + */ +import { beforeEach, describe, expect, it, vi } from 'vitest' + +const { mockMaskPIIBatch } = vi.hoisted(() => ({ + mockMaskPIIBatch: vi.fn(), +})) + +vi.mock('@/lib/guardrails/validate_pii', () => ({ + maskPIIBatch: mockMaskPIIBatch, +})) + +import { REDACTION_FAILED_MARKER, redactPIIFromExecution } from '@/lib/logs/execution/pii-redaction' + +describe('redactPIIFromExecution', () => { + beforeEach(() => { + vi.clearAllMocks() + // Default: echo each input uppercased so we can assert substitution by position. + mockMaskPIIBatch.mockImplementation(async (texts: string[]) => texts.map((t) => `MASKED(${t})`)) + }) + + it('collects and masks string leaves recursively, preserving structure', async () => { + const payload = { + traceSpans: [ + { + blockId: 'b1', + status: 'success', + input: { email: 'a@b.com' }, + output: { text: 'hello' }, + children: [{ blockId: 'c1', output: { nested: 'deep' } }], + }, + ], + finalOutput: { answer: 'world' }, + workflowInput: 'start', + } + + const result = await redactPIIFromExecution(payload, { entityTypes: ['EMAIL_ADDRESS'] }) + + const span = (result.traceSpans as any[])[0] + expect(span.blockId).toBe('b1') + expect(span.status).toBe('success') + expect(span.input.email).toBe('MASKED(a@b.com)') + expect(span.output.text).toBe('MASKED(hello)') + expect(span.children[0].output.nested).toBe('MASKED(deep)') + expect((result.finalOutput as any).answer).toBe('MASKED(world)') + expect(result.workflowInput).toBe('MASKED(start)') + expect(mockMaskPIIBatch).toHaveBeenCalledTimes(1) + expect(mockMaskPIIBatch.mock.calls[0][0]).toEqual([ + 'a@b.com', + 'hello', + 'deep', + 'world', + 'start', + ]) + }) + + it('does not mutate the original payload', async () => { + const payload = { finalOutput: { answer: 'world' } } + await redactPIIFromExecution(payload, { entityTypes: [] }) + expect(payload.finalOutput.answer).toBe('world') + }) + + it('scrubs all eligible strings when masking throws (no leak)', async () => { + mockMaskPIIBatch.mockRejectedValueOnce(new Error('presidio down')) + const payload = { + traceSpans: [{ output: { text: 'secret@x.com' } }], + finalOutput: 'another secret', + } + + const result = await redactPIIFromExecution(payload, { entityTypes: [] }) + + expect((result.traceSpans as any[])[0].output.text).toBe(REDACTION_FAILED_MARKER) + expect(result.finalOutput).toBe(REDACTION_FAILED_MARKER) + }) + + it('masks large strings too (never left unredacted)', async () => { + const big = 'x'.repeat(200 * 1024) + const payload = { finalOutput: { big, small: 'pii' } } + + const result = await redactPIIFromExecution(payload, { entityTypes: [] }) + + expect((result.finalOutput as any).big).toBe(`MASKED(${big})`) + expect((result.finalOutput as any).small).toBe('MASKED(pii)') + expect(mockMaskPIIBatch.mock.calls[0][0]).toEqual([big, 'pii']) + }) + + it('masks span error/errorMessage and top-level error, trigger, executionState, environment', async () => { + const payload = { + traceSpans: [{ blockId: 'b1', error: 'failed for bob@x.com', errorMessage: 'bad input z' }], + error: 'run failed: a@b.com', + completionFailure: 'cancelled by c@d.com', + trigger: { type: 'webhook', data: { from: 'caller@x.com' } }, + executionState: { status: 'completed', note: 'state for e@f.com' }, + environment: { variables: { CONTACT: 'admin@x.com' } }, + correlation: { source: 'corr@x.com' }, + } + + const result = await redactPIIFromExecution(payload, { entityTypes: ['EMAIL_ADDRESS'] }) + + const span = (result.traceSpans as any[])[0] + expect(span.blockId).toBe('b1') + expect(span.error).toBe('MASKED(failed for bob@x.com)') + expect(span.errorMessage).toBe('MASKED(bad input z)') + expect(result.error).toBe('MASKED(run failed: a@b.com)') + expect(result.completionFailure).toBe('MASKED(cancelled by c@d.com)') + expect((result.trigger as any).type).toBe('MASKED(webhook)') + expect((result.trigger as any).data.from).toBe('MASKED(caller@x.com)') + expect((result.executionState as any).note).toBe('MASKED(state for e@f.com)') + expect((result.environment as any).variables.CONTACT).toBe('MASKED(admin@x.com)') + expect((result.correlation as any).source).toBe('MASKED(corr@x.com)') + }) + + it('returns payload unchanged when there is nothing to mask', async () => { + const payload = { traceSpans: [{ blockId: 'b1', count: 5 }] } + const result = await redactPIIFromExecution(payload, { entityTypes: [] }) + expect(result).toBe(payload) + expect(mockMaskPIIBatch).not.toHaveBeenCalled() + }) +}) diff --git a/apps/sim/lib/logs/execution/pii-redaction.ts b/apps/sim/lib/logs/execution/pii-redaction.ts new file mode 100644 index 00000000000..7b4794fd483 --- /dev/null +++ b/apps/sim/lib/logs/execution/pii-redaction.ts @@ -0,0 +1,181 @@ +import { createLogger } from '@sim/logger' +import { getErrorMessage } from '@sim/utils/errors' + +const logger = createLogger('PiiRedaction') + +/** Replaces text we could not safely mask, so PII is never persisted on failure. */ +export const REDACTION_FAILED_MARKER = '[REDACTION_FAILED]' + +/** + * Upper bound on total text masked for one execution. Beyond this we scrub the + * whole payload rather than spend minutes in NER (never leave it unmasked). + * Typical inline logs (≤3MB) stay well under. Individual strings are never + * skipped by size — they would otherwise persist unredacted. + */ +const PII_MAX_TOTAL_BYTES = 16 * 1024 * 1024 + +export interface PiiRedactionOptions { + /** Presidio entity types to mask. Empty = redact all detected PII. */ + entityTypes: string[] + language?: string +} + +export interface RedactablePayload { + traceSpans?: unknown + finalOutput?: unknown + workflowInput?: unknown + error?: unknown + completionFailure?: unknown + trigger?: unknown + executionState?: unknown + environment?: unknown + correlation?: unknown +} + +/** Keys of {@link RedactablePayload} processed by the redactor, in order. */ +const REDACTABLE_KEYS: (keyof RedactablePayload)[] = [ + 'traceSpans', + 'finalOutput', + 'workflowInput', + 'error', + 'completionFailure', + 'trigger', + 'executionState', + 'environment', + 'correlation', +] + +/** Trace-span fields that carry runtime content (and therefore possible PII). */ +const SPAN_CONTENT_FIELDS = [ + 'input', + 'output', + 'thinking', + 'modelToolCalls', + 'toolCalls', + 'error', + 'errorMessage', +] as const + +function isEligibleString(value: string): boolean { + return value.length > 0 +} + +/** + * Rebuild `value` replacing every eligible string leaf with `handle(leaf)`. + * Used for both collection (handle records and returns the input) and + * substitution (handle returns the masked value), so traversal order and + * eligibility are guaranteed identical across the two passes. + */ +function transformStrings(value: unknown, handle: (s: string) => string): unknown { + if (typeof value === 'string') { + return isEligibleString(value) ? handle(value) : value + } + if (Array.isArray(value)) { + return value.map((item) => transformStrings(item, handle)) + } + if (value !== null && typeof value === 'object') { + const out: Record = {} + for (const [key, v] of Object.entries(value)) { + out[key] = transformStrings(v, handle) + } + return out + } + return value +} + +/** + * Redact a trace span: only its content fields ({@link SPAN_CONTENT_FIELDS}) and + * nested `children` are walked, leaving structural metadata (blockId, name, + * status, timing) untouched so log correlation/display is preserved. + */ +function transformSpan(span: unknown, handle: (s: string) => string): unknown { + if (span === null || typeof span !== 'object' || Array.isArray(span)) { + return transformStrings(span, handle) + } + const source = span as Record + const out: Record = { ...source } + for (const field of SPAN_CONTENT_FIELDS) { + if (field in out) out[field] = transformStrings(out[field], handle) + } + if (Array.isArray(source.children)) { + out.children = source.children.map((child) => transformSpan(child, handle)) + } + return out +} + +function transformUnit( + key: keyof RedactablePayload, + value: unknown, + handle: (s: string) => string +): unknown { + if (key === 'traceSpans' && Array.isArray(value)) { + return value.map((span) => transformSpan(span, handle)) + } + return transformStrings(value, handle) +} + +/** + * Mask PII across an execution's `traceSpans` / `finalOutput` / `workflowInput`. + * + * All eligible string leaves are collected in one deterministic pass and masked + * in a single batched (byte-chunked) Presidio call — so subprocess count scales + * with payload size, not block count. Each unit is then rebuilt independently + * from the masked slice, preserving the JSON structure (Presidio never sees the + * envelope). On a hard masking failure or when the payload exceeds the ceiling, + * eligible strings are replaced with {@link REDACTION_FAILED_MARKER} rather than + * left unredacted — PII is never persisted on the failure path. + */ +export async function redactPIIFromExecution( + payload: RedactablePayload, + options: PiiRedactionOptions +): Promise { + const { entityTypes } = options + const language = options.language ?? 'en' + + const units = REDACTABLE_KEYS.filter((key) => payload[key] !== undefined).map((key) => ({ + key, + value: payload[key], + })) + + const collected: string[] = [] + let totalBytes = 0 + for (const unit of units) { + transformUnit(unit.key, unit.value, (s) => { + collected.push(s) + totalBytes += Buffer.byteLength(s, 'utf8') + return s + }) + } + + if (collected.length === 0) return payload + + let masked: string[] + if (totalBytes > PII_MAX_TOTAL_BYTES) { + logger.warn('Execution exceeds PII redaction ceiling; scrubbing text', { + totalBytes, + ceiling: PII_MAX_TOTAL_BYTES, + }) + masked = collected.map(() => REDACTION_FAILED_MARKER) + } else { + try { + // Lazy import keeps the Python-spawning guardrails module (child_process + + // a `lib/guardrails` dir reference) out of the static middleware/RSC graph; + // it's only loaded at runtime on the Node log-persist path. + const { maskPIIBatch } = await import('@/lib/guardrails/validate_pii') + masked = await maskPIIBatch(collected, entityTypes, language) + } catch (error) { + logger.error('PII masking failed; scrubbing text to avoid leaking PII', { + error: getErrorMessage(error), + stringCount: collected.length, + }) + masked = collected.map(() => REDACTION_FAILED_MARKER) + } + } + + let index = 0 + const result: RedactablePayload = { ...payload } + for (const unit of units) { + result[unit.key] = transformUnit(unit.key, unit.value, () => masked[index++]) + } + return result +} diff --git a/packages/db/schema.ts b/packages/db/schema.ts index 54366a6a024..ddcbc39011c 100644 --- a/packages/db/schema.ts +++ b/packages/db/schema.ts @@ -1063,6 +1063,37 @@ export const chat = pgTable( } ) +/** + * A single PII redaction rule. Lives in the org-level + * {@link DataRetentionSettings.piiRedaction} rules list. Each rule targets one + * scope — all workspaces (`workspaceId: null`) or a single workspace — and + * `workspaceId` is unique across rules. Resolution is most-specific-wins: a + * workspace's own rule overrides the all-workspaces rule (never unioned). + */ +export interface PiiRedactionRule { + id: string + name?: string + /** Presidio entity types to mask. Empty = redact nothing for this scope. */ + entityTypes: string[] + /** `null` = all workspaces; otherwise the single targeted workspace. */ + workspaceId: string | null +} + +/** + * Org-level data retention + governance settings. Retention-hours fall back to + * plan defaults when unset. `piiRedaction.rules` are org-scoped; each rule + * selects which workspaces it applies to. + */ +export interface DataRetentionSettings { + logRetentionHours?: number | null + softDeleteRetentionHours?: number | null + taskCleanupHours?: number | null + /** Enterprise PII redaction rules applied to workflow logs on persist. */ + piiRedaction?: { + rules?: PiiRedactionRule[] + } | null +} + export const organization = pgTable('organization', { id: text('id').primaryKey(), name: text('name').notNull(), @@ -1082,11 +1113,7 @@ export const organization = pgTable('organization', { privacyUrl?: string hidePoweredBySim?: boolean }>(), - dataRetentionSettings: json('data_retention_settings').$type<{ - logRetentionHours?: number | null - softDeleteRetentionHours?: number | null - taskCleanupHours?: number | null - }>(), + dataRetentionSettings: json('data_retention_settings').$type(), orgUsageLimit: decimal('org_usage_limit'), /** * Storage upload/delete hot-path tracker for org-scoped plans.