GCC Build OS v0
/api

Document

retention: operational

An uploaded artefact stored in R2, chunked and embedded for retrieval, classified for visibility control. Source-tracked: every chunk carries provenance metadata. Per-document classification per Q1 (4-tier in Phase 1A; PDPL alignment lands in Phase 1B).

RACI requirements

decision_maker
n/a
approver
n/a
reviewer
n/a
contributor
n/a
informed
optional

JSON Schema

{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://gcc.bootminds.com/ontology/node-types/document.json",
  "title": "Document",
  "description": "An uploaded artefact stored in R2, chunked and embedded for retrieval, classified for visibility control. Source-tracked: every chunk carries provenance metadata. Per-document classification per Q1 (4-tier in Phase 1A; PDPL alignment lands in Phase 1B).",
  "type": "object",
  "x-node-type": "Document",
  "x-lifecycle-state-machine": null,
  "x-raci-requirements": {
    "decision_maker": "n/a",
    "approver": "n/a",
    "reviewer": "n/a",
    "contributor": "n/a",
    "informed": "optional"
  },
  "x-retention-class": "operational",
  "required": [
    "node_id",
    "node_type",
    "tenant_id",
    "attributes",
    "state",
    "created_at",
    "created_by"
  ],
  "properties": {
    "node_id": {
      "type": "string",
      "format": "uuid"
    },
    "node_type": {
      "type": "string",
      "const": "Document"
    },
    "tenant_id": {
      "type": "string",
      "format": "uuid"
    },
    "state": {
      "type": "string",
      "enum": [
        "uploaded",
        "parsing",
        "embedding",
        "ready",
        "failed",
        "archived"
      ],
      "description": "Document processing lifecycle. A formal state machine may emerge later; for Phase 0 the enum is authoritative and transitions are governed by the parse/embed queue pipeline."
    },
    "attributes": {
      "type": "object",
      "additionalProperties": false,
      "required": [
        "filename",
        "mime_type",
        "r2_key",
        "classification",
        "uploaded_by_ref",
        "uploaded_at",
        "embedding_model",
        "embedding_model_version",
        "pii_scan_status"
      ],
      "properties": {
        "filename": {
          "type": "string",
          "minLength": 1,
          "description": "Original filename at upload."
        },
        "mime_type": {
          "type": "string",
          "minLength": 1,
          "description": "MIME type as detected on upload (e.g., 'application/pdf', 'text/plain', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')."
        },
        "r2_key": {
          "type": "string",
          "minLength": 1,
          "description": "R2 object key for the immutable source artefact. Tenant-scoped key prefix enforced by I1 typed binding constructor."
        },
        "classification": {
          "type": "string",
          "enum": [
            "Public",
            "Internal",
            "Confidential",
            "Restricted"
          ],
          "description": "Q1: 4-tier classification. Drives retrieval visibility; Restricted documents are filtered from AI retrieval sets for non-Advisor roles. PDPL alignment in Phase 1B may extend or remap this scheme via event-sourced reclassification (DocumentReclassified event)."
        },
        "uploaded_by_ref": {
          "type": "string",
          "format": "uuid",
          "description": "Stakeholder UUID of the uploader."
        },
        "uploaded_at": {
          "type": "string",
          "format": "date-time",
          "description": "Upload timestamp."
        },
        "embedding_model": {
          "type": "string",
          "minLength": 1,
          "description": "I8: name of the embedding model used (e.g., '@cf/baai/bge-large-en-v1.5'). Recorded for provenance and reproducibility."
        },
        "embedding_model_version": {
          "type": "string",
          "minLength": 1,
          "description": "I8: version identifier of the embedding model. Recorded for provenance and reproducibility."
        },
        "pii_scan_status": {
          "type": "string",
          "enum": [
            "NotScanned",
            "Clean",
            "Flagged"
          ],
          "description": "Result of the PII / secrets scan performed on upload (1A.10.5). Flagged documents may require reclassification or redaction before retrieval is enabled."
        },
        "workstream_tag": {
          "type": [
            "string",
            "null"
          ],
          "format": "uuid",
          "description": "Optional workstream UUID associated with this document for tag-based retrieval scoping. Stored on each chunk's vector metadata as well."
        },
        "question_tag": {
          "type": [
            "string",
            "null"
          ],
          "format": "uuid",
          "description": "Optional question UUID associated with this document (e.g., uploaded in direct response to a question). Stored on each chunk's vector metadata as well."
        },
        "chunk_count": {
          "type": [
            "integer",
            "null"
          ],
          "minimum": 0,
          "description": "Number of chunks produced after parsing. Null until the document has reached the 'ready' state."
        },
        "byte_size": {
          "type": [
            "integer",
            "null"
          ],
          "minimum": 0,
          "description": "Size in bytes of the source artefact."
        }
      }
    },
    "created_at": {
      "type": "string",
      "format": "date-time"
    },
    "created_by": {
      "type": "string",
      "format": "uuid"
    },
    "updated_at": {
      "type": "string",
      "format": "date-time"
    }
  }
}