{
  "openapi": "3.1.0",
  "info": {
    "title": "AutoChunk API",
    "summary": "Permission-aware chunks for AI retrieval.",
    "description": "AutoChunk turns business documents (PDF, HTML, DOCX, plain text) into AI-retrieval-ready chunks tagged with department, access level, source URL, and per-principal permissions. The data layer for compliance-aware RAG.\n\nThis specification covers the customer-facing `/api/v1/*` endpoints. Internal plumbing routes (`/api/playground/chunk`, `/api/signup-request`) are intentionally omitted.",
    "version": "1.0.0",
    "termsOfService": "https://autochunk.ai/docs",
    "contact": {
      "name": "AutoChunk Support",
      "email": "hello@autochunk.ai",
      "url": "https://autochunk.ai/docs"
    },
    "license": { "name": "Proprietary" }
  },
  "servers": [{ "url": "https://autochunk.ai", "description": "Production" }],
  "externalDocs": {
    "description": "Full documentation",
    "url": "https://autochunk.ai/docs"
  },
  "security": [{ "ApiKeyAuth": [] }],
  "tags": [
    {
      "name": "Chunking",
      "description": "Convert text into permission-tagged chunks."
    },
    {
      "name": "Extraction",
      "description": "Convert binary documents into clean UTF-8 text."
    }
  ],
  "paths": {
    "/api/v1/chunk": {
      "post": {
        "tags": ["Chunking"],
        "operationId": "chunkText",
        "summary": "Chunk text with permission tags",
        "description": "Token-window chunking with paragraph-aware boundaries. Each returned chunk inherits department, access_level, and source_url from the parent source row, plus per-principal ACL rows from the optional `permissions[]` array. Persists `sources`, `chunks`, and `chunk_permissions` to the AutoChunk database for retrieval-time filtering.",
        "requestBody": {
          "required": true,
          "content": {
            "application/json": { "schema": { "$ref": "#/components/schemas/ChunkRequest" } }
          }
        },
        "responses": {
          "200": {
            "description": "Chunks created and persisted.",
            "content": {
              "application/json": { "schema": { "$ref": "#/components/schemas/ChunkResponse" } }
            }
          },
          "400": {
            "description": "Validation error (`invalid_json`, `invalid_request`, or `empty_content`).",
            "content": {
              "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" } }
            }
          },
          "401": { "$ref": "#/components/responses/Unauthorized" },
          "429": { "$ref": "#/components/responses/RateLimited" },
          "500": { "$ref": "#/components/responses/ServerError" }
        }
      },
      "get": {
        "tags": ["Chunking"],
        "operationId": "chunkMethodNotAllowed",
        "summary": "Method not allowed (POST only)",
        "responses": {
          "405": {
            "description": "GET is not supported. Use POST with a JSON body.",
            "content": {
              "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" } }
            }
          }
        }
      }
    },
    "/api/v1/extract": {
      "post": {
        "tags": ["Extraction"],
        "operationId": "extractFile",
        "summary": "Extract clean text from PDF / HTML / DOCX / plain text",
        "description": "Multipart file upload. Detects format via magic bytes → MIME → extension, then dispatches to a per-format extractor (`unpdf` for PDF, `cheerio` for HTML, `mammoth` for DOCX, passthrough for plain text). 10MB file size cap. Same authentication and rate-limit posture as `/api/v1/chunk`.",
        "requestBody": {
          "required": true,
          "content": {
            "multipart/form-data": {
              "schema": {
                "type": "object",
                "required": ["file"],
                "properties": {
                  "file": {
                    "type": "string",
                    "format": "binary",
                    "description": "The file to extract. Up to 10MB. PDF, HTML, DOCX, or plain text."
                  },
                  "format": {
                    "type": "string",
                    "enum": ["pdf", "html", "docx", "text"],
                    "description": "Optional format override. Useful when autodetection guesses wrong."
                  }
                }
              }
            }
          }
        },
        "responses": {
          "200": {
            "description": "Extracted text and metadata.",
            "content": {
              "application/json": { "schema": { "$ref": "#/components/schemas/ExtractResponse" } }
            }
          },
          "400": {
            "description": "`invalid_content_type` (not multipart) or `missing_file` (no file field).",
            "content": {
              "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" } }
            }
          },
          "401": { "$ref": "#/components/responses/Unauthorized" },
          "413": {
            "description": "`payload_too_large` — file exceeds 10MB.",
            "content": {
              "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" } }
            }
          },
          "415": {
            "description": "`unsupported_format` — file type couldn't be detected.",
            "content": {
              "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" } }
            }
          },
          "422": {
            "description": "`extraction_failed` (parser couldn't read the file) or `empty_output` (parsed but produced no text — typically a scanned PDF).",
            "content": {
              "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" } }
            }
          },
          "429": { "$ref": "#/components/responses/RateLimited" },
          "500": { "$ref": "#/components/responses/ServerError" }
        }
      }
    }
  },
  "components": {
    "securitySchemes": {
      "ApiKeyAuth": {
        "type": "apiKey",
        "in": "header",
        "name": "x-api-key",
        "description": "Invite-only key from RedHub AI. Format: `rh_live_<48 hex chars>`. SHA-256 hashed server-side; raw key shown to customer once at creation."
      }
    },
    "schemas": {
      "PrincipalType": {
        "type": "string",
        "enum": ["user", "group", "role", "department"]
      },
      "Permission": { "type": "string", "enum": ["read", "write"] },
      "AccessLevel": {
        "type": "string",
        "enum": ["public", "internal", "restricted", "confidential"]
      },
      "SourceType": {
        "type": "string",
        "enum": ["pdf", "webpage", "crm", "sop", "ticket", "transcript", "text", "other"]
      },
      "PermissionEntry": {
        "type": "object",
        "required": ["principal_type", "principal_id", "permission"],
        "properties": {
          "principal_type": { "$ref": "#/components/schemas/PrincipalType" },
          "principal_id": {
            "type": "string",
            "minLength": 1,
            "maxLength": 256,
            "description": "Arbitrary identifier from the customer's IDP (user id, group id, role name, etc.)."
          },
          "permission": { "$ref": "#/components/schemas/Permission" }
        }
      },
      "ChunkRequest": {
        "type": "object",
        "required": ["source", "content"],
        "properties": {
          "source": {
            "type": "object",
            "required": ["type"],
            "properties": {
              "type": { "$ref": "#/components/schemas/SourceType" },
              "url": { "type": "string", "format": "uri" },
              "title": { "type": "string", "maxLength": 512 },
              "department": {
                "type": "string",
                "maxLength": 64,
                "description": "Suggested values: finance, legal, hr, sales, ops, engineering. Free-form."
              },
              "access_level": { "$ref": "#/components/schemas/AccessLevel" },
              "permissions": {
                "type": "array",
                "items": { "$ref": "#/components/schemas/PermissionEntry" },
                "maxItems": 256
              },
              "metadata": {
                "type": "object",
                "additionalProperties": true,
                "description": "Free-form caller metadata stored alongside the source row."
              }
            }
          },
          "content": {
            "type": "string",
            "minLength": 1,
            "maxLength": 2000000,
            "description": "Pre-extracted UTF-8 text. For binary inputs, call POST /api/v1/extract first."
          },
          "options": {
            "type": "object",
            "properties": {
              "chunk_size": {
                "type": "integer",
                "minimum": 64,
                "maximum": 4096,
                "default": 512,
                "description": "Target tokens per chunk."
              },
              "chunk_overlap": {
                "type": "integer",
                "minimum": 0,
                "maximum": 1024,
                "default": 50,
                "description": "Overlap tokens between consecutive chunks. Must be less than chunk_size."
              },
              "summarize": {
                "type": "boolean",
                "default": false,
                "description": "Reserved for a future LLM summarization feature; currently a no-op. summary will always be null in the response."
              }
            }
          }
        }
      },
      "Chunk": {
        "type": "object",
        "required": [
          "chunk_id",
          "source_id",
          "chunk_text",
          "summary",
          "department",
          "access_level",
          "source_url",
          "token_count",
          "embedding_ready"
        ],
        "properties": {
          "chunk_id": { "type": "string", "format": "uuid" },
          "source_id": { "type": "string", "format": "uuid" },
          "chunk_text": { "type": "string" },
          "summary": { "type": ["string", "null"] },
          "department": { "type": ["string", "null"] },
          "access_level": {
            "oneOf": [
              { "$ref": "#/components/schemas/AccessLevel" },
              { "type": "null" }
            ]
          },
          "source_url": { "type": ["string", "null"], "format": "uri" },
          "token_count": { "type": "integer", "minimum": 0 },
          "embedding_ready": { "type": "boolean" }
        }
      },
      "ChunkResponse": {
        "type": "object",
        "required": ["source_id", "chunk_count", "total_tokens", "chunks"],
        "properties": {
          "source_id": { "type": "string", "format": "uuid" },
          "chunk_count": { "type": "integer", "minimum": 0 },
          "total_tokens": { "type": "integer", "minimum": 0 },
          "chunks": { "type": "array", "items": { "$ref": "#/components/schemas/Chunk" } }
        }
      },
      "ExtractResponse": {
        "type": "object",
        "required": ["source_id", "format", "text", "metadata"],
        "properties": {
          "source_id": { "type": "string" },
          "format": { "type": "string", "enum": ["pdf", "html", "docx", "text"] },
          "text": { "type": "string" },
          "metadata": {
            "type": "object",
            "required": ["characters", "words", "extraction_method", "tokens"],
            "properties": {
              "pages": { "type": "integer", "description": "PDFs only." },
              "characters": { "type": "integer" },
              "words": { "type": "integer" },
              "extraction_method": {
                "type": "string",
                "enum": ["unpdf", "cheerio", "mammoth", "passthrough"]
              },
              "tokens": { "type": "integer" }
            }
          }
        }
      },
      "ErrorResponse": {
        "type": "object",
        "required": ["error"],
        "properties": {
          "error": {
            "type": "object",
            "required": ["code", "message"],
            "properties": {
              "code": {
                "type": "string",
                "examples": [
                  "invalid_json",
                  "invalid_request",
                  "empty_content",
                  "missing_api_key",
                  "invalid_api_key",
                  "key_disabled",
                  "monthly_quota_exceeded",
                  "rate_limit_exceeded",
                  "internal_error",
                  "auth_unavailable",
                  "invalid_content_type",
                  "missing_file",
                  "payload_too_large",
                  "unsupported_format",
                  "extraction_failed",
                  "empty_output",
                  "method_not_allowed"
                ]
              },
              "message": { "type": "string" },
              "issues": {
                "type": "array",
                "description": "Present on 400 invalid_request. Each issue describes one Zod validation failure.",
                "items": {
                  "type": "object",
                  "properties": {
                    "code": { "type": "string" },
                    "path": { "type": "array", "items": { "type": ["string", "number"] } },
                    "message": { "type": "string" }
                  }
                }
              }
            }
          }
        }
      }
    },
    "responses": {
      "Unauthorized": {
        "description": "`missing_api_key`, `invalid_api_key`, or `key_disabled`.",
        "content": {
          "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" } }
        }
      },
      "RateLimited": {
        "description": "`rate_limit_exceeded` (per-minute) or `monthly_quota_exceeded`. The per-minute response includes a `Retry-After: 60` header.",
        "headers": {
          "Retry-After": {
            "description": "Seconds the client should wait before retrying (per-minute rate limit only).",
            "schema": { "type": "integer" }
          }
        },
        "content": {
          "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" } }
        }
      },
      "ServerError": {
        "description": "`internal_error` (unhandled exception, captured by Sentry) or `auth_unavailable` (Supabase reachability issue — retry with backoff).",
        "content": {
          "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" } }
        }
      }
    }
  }
}
