Document
retention: operational
An uploaded artefact stored in R2, chunked and embedded for retrieval, classified for visibility control. Source-tracked: every chunk carries provenance metadata. Per-document classification per Q1 (4-tier in Phase 1A; PDPL alignment lands in Phase 1B).
RACI requirements
decision_maker: n/a
approver: n/a
reviewer: n/a
contributor: n/a
informed: optional
JSON Schema
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://gcc.bootminds.com/ontology/node-types/document.json",
"title": "Document",
"description": "An uploaded artefact stored in R2, chunked and embedded for retrieval, classified for visibility control. Source-tracked: every chunk carries provenance metadata. Per-document classification per Q1 (4-tier in Phase 1A; PDPL alignment lands in Phase 1B).",
"type": "object",
"x-node-type": "Document",
"x-lifecycle-state-machine": null,
"x-raci-requirements": {
"decision_maker": "n/a",
"approver": "n/a",
"reviewer": "n/a",
"contributor": "n/a",
"informed": "optional"
},
"x-retention-class": "operational",
"required": [
"node_id",
"node_type",
"tenant_id",
"attributes",
"state",
"created_at",
"created_by"
],
"properties": {
"node_id": {
"type": "string",
"format": "uuid"
},
"node_type": {
"const": "Document"
},
"tenant_id": {
"type": "string",
"format": "uuid"
},
"state": {
"type": "string",
"enum": [
"uploaded",
"parsing",
"embedding",
"ready",
"failed",
"archived"
],
"description": "Document processing lifecycle. A formal state machine may emerge later; for Phase 0 the enum is authoritative and transitions are governed by the parse/embed queue pipeline."
},
"attributes": {
"type": "object",
"additionalProperties": false,
"required": [
"filename",
"mime_type",
"r2_key",
"classification",
"uploaded_by_ref",
"uploaded_at",
"embedding_model",
"embedding_model_version",
"pii_scan_status"
],
"properties": {
"filename": {
"type": "string",
"minLength": 1,
"description": "Original filename at upload."
},
"mime_type": {
"type": "string",
"minLength": 1,
"description": "MIME type as detected on upload (e.g., 'application/pdf', 'text/plain', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')."
},
"r2_key": {
"type": "string",
"minLength": 1,
"description": "R2 object key for the immutable source artefact. Tenant-scoped key prefix enforced by I1 typed binding constructor."
},
"classification": {
"type": "string",
"enum": [
"Public",
"Internal",
"Confidential",
"Restricted"
],
"description": "Q1: 4-tier classification. Drives retrieval visibility; Restricted documents are filtered from AI retrieval sets for non-Advisor roles. PDPL alignment in Phase 1B may extend or remap this scheme via event-sourced reclassification (DocumentReclassified event)."
},
"uploaded_by_ref": {
"type": "string",
"format": "uuid",
"description": "Stakeholder UUID of the uploader."
},
"uploaded_at": {
"type": "string",
"format": "date-time",
"description": "Upload timestamp."
},
"embedding_model": {
"type": "string",
"minLength": 1,
"description": "I8: name of the embedding model used (e.g., '@cf/baai/bge-large-en-v1.5'). Recorded for provenance and reproducibility."
},
"embedding_model_version": {
"type": "string",
"minLength": 1,
"description": "I8: version identifier of the embedding model. Recorded for provenance and reproducibility."
},
"pii_scan_status": {
"type": "string",
"enum": [
"NotScanned",
"Clean",
"Flagged"
],
"description": "Result of the PII / secrets scan performed on upload (1A.10.5). Flagged documents may require reclassification or redaction before retrieval is enabled."
},
"workstream_tag": {
"type": [
"string",
"null"
],
"format": "uuid",
"description": "Optional workstream UUID associated with this document for tag-based retrieval scoping. Stored on each chunk's vector metadata as well."
},
"question_tag": {
"type": [
"string",
"null"
],
"format": "uuid",
"description": "Optional question UUID associated with this document (e.g., uploaded in direct response to a question). Stored on each chunk's vector metadata as well."
},
"chunk_count": {
"type": [
"integer",
"null"
],
"minimum": 0,
"description": "Number of chunks produced after parsing. Null until the document reaches the 'ready' state."
},
"byte_size": {
"type": [
"integer",
"null"
],
"minimum": 0,
"description": "Size in bytes of the source artefact."
}
}
},
"created_at": {
"type": "string",
"format": "date-time"
},
"created_by": {
"type": "string",
"format": "uuid"
},
"updated_at": {
"type": "string",
"format": "date-time"
}
}
}