Database & Storage Schema

⚠️ Important (Aurora Nexus) : la source de vérité du schéma est aurora_schema.sql + les migrations migrations/NN_*.sql (appliquées via bash ops/init_db.sh).

Ce fichier contient un snapshot historique (dump pg_dump --schema-only) qui peut être en décalage avec la version courante. Il est conservé pour référence, mais ne doit pas être utilisé comme baseline “déploiement”.

This document contains an historical snapshot (pg_dump --schema-only). The current source of truth is aurora_schema.sql + numbered migrations in migrations/ (NN_*.sql).

Addendum (janvier 2026) — conversations multi-tours (follow-ups)

Certaines fonctionnalités “conversationnelles” reposent sur deux éléments DB (voir migrations récentes) :

1) `thread_state` (état minimal par conversation)

Table : public.thread_state

But : stocker un outline non dangereux (pas du contenu de document) pour résoudre des références de follow-up du type “S2 / point B / 2e point”.

Champs utiles (résumé) :

tenant_id (uuid), caller_app (text), thread_id (text) — clé primaire
last_outline (jsonb) : { "sections": [{"id":"S1","title":"..."}, ...] }
last_doc_ids / last_titles (jsonb) : docs utilisés lors du dernier livrable
last_answer_query_id (uuid) : id de requête (queries_log.id) ayant produit l’outline
updated_at (timestamptz)

2) `queries_log` (observabilité follow-up)

Des champs complémentaires permettent de diagnostiquer la qualité du pipeline :

thread_id
retrieval_query_final, retrieval_queries_tried, retrieval_retry_count
condense_used, condense_model
followup_resolved, followup_section_id, followup_section_title, followup_ref_type

Pour la définition exacte, se référer à :

aurora_schema.sql (baseline)
migrations/42_*, migrations/43_*, migrations/44_* (additifs)

1. PostgreSQL Schema

This schema is generated directly from the production database using pg_dump --schema-only.

--
-- PostgreSQL database dump
--

-- Dumped from database version 16.10 (Debian 16.10-1.pgdg13+1)
-- Dumped by pg_dump version 16.10

SET statement_timeout = 0;
SET lock_timeout = 0;
SET idle_in_transaction_session_timeout = 0;
SET client_encoding = 'UTF8';
SET standard_conforming_strings = on;
SELECT pg_catalog.set_config('search_path', '', false);
SET check_function_bodies = false;
SET xmloption = content;
SET client_min_messages = warning;
SET row_security = off;

--
-- Name: btree_gin; Type: EXTENSION; Schema: -; Owner: -
--

CREATE EXTENSION IF NOT EXISTS btree_gin WITH SCHEMA public;


--
-- Name: EXTENSION btree_gin; Type: COMMENT; Schema: -; Owner: 
--

COMMENT ON EXTENSION btree_gin IS 'support for indexing common datatypes in GIN';


--
-- Name: pg_trgm; Type: EXTENSION; Schema: -; Owner: -
--

CREATE EXTENSION IF NOT EXISTS pg_trgm WITH SCHEMA public;


--
-- Name: EXTENSION pg_trgm; Type: COMMENT; Schema: -; Owner: 
--

COMMENT ON EXTENSION pg_trgm IS 'text similarity measurement and index searching based on trigrams';


--
-- Name: pgcrypto; Type: EXTENSION; Schema: -; Owner: -
--

CREATE EXTENSION IF NOT EXISTS pgcrypto WITH SCHEMA public;


--
-- Name: EXTENSION pgcrypto; Type: COMMENT; Schema: -; Owner: 
--

COMMENT ON EXTENSION pgcrypto IS 'cryptographic functions';


--
-- Name: asset_kind; Type: TYPE; Schema: public; Owner: sylvain
--

CREATE TYPE public.asset_kind AS ENUM (
    'image',
    'table',
    'figure',
    'docling_json',
    'markdown',
    'docling_analysis_json'
);


ALTER TYPE public.asset_kind OWNER TO sylvain;

--
-- Name: distance_metric; Type: TYPE; Schema: public; Owner: sylvain
--

CREATE TYPE public.distance_metric AS ENUM (
    'cosine',
    'euclid',
    'dot'
);


ALTER TYPE public.distance_metric OWNER TO sylvain;

--
-- Name: doc_status; Type: TYPE; Schema: public; Owner: sylvain
--

CREATE TYPE public.doc_status AS ENUM (
    'new',
    'parsed',
    'chunked',
    'indexed',
    'error'
);


ALTER TYPE public.doc_status OWNER TO sylvain;

--
-- Name: job_status; Type: TYPE; Schema: public; Owner: sylvain
--

CREATE TYPE public.job_status AS ENUM (
    'queued',
    'running',
    'done',
    'failed',
    'skipped'
);


ALTER TYPE public.job_status OWNER TO sylvain;

--
-- Name: source_type; Type: TYPE; Schema: public; Owner: sylvain
--

CREATE TYPE public.source_type AS ENUM (
    'file',
    'web',
    'gdrive',
    'webdav',
    'email',
    'api',
    'doc'
);


ALTER TYPE public.source_type OWNER TO sylvain;

--
-- Name: get_document_metadata_base(uuid); Type: FUNCTION; Schema: public; Owner: sylvain
--

CREATE FUNCTION public.get_document_metadata_base(p_document_id uuid) RETURNS jsonb
    LANGUAGE sql STABLE
    AS $$
  SELECT
    (vd.metadata_base || jsonb_build_object(
      'ingestion_job_id', lj.ingestion_job_id
    ))
  FROM v_documents_metadata vd
  LEFT JOIN v_last_job_per_document lj ON lj.document_id = vd.document_id
  WHERE vd.document_id = p_document_id;
$$;


ALTER FUNCTION public.get_document_metadata_base(p_document_id uuid) OWNER TO sylvain;

--
-- Name: set_updated_at(); Type: FUNCTION; Schema: public; Owner: sylvain
--

CREATE FUNCTION public.set_updated_at() RETURNS trigger
    LANGUAGE plpgsql
    AS $$
BEGIN
  NEW.updated_at := now();
  RETURN NEW;
END $$;


ALTER FUNCTION public.set_updated_at() OWNER TO sylvain;

--
-- Name: to_epoch_ms(timestamp with time zone); Type: FUNCTION; Schema: public; Owner: sylvain
--

CREATE FUNCTION public.to_epoch_ms(ts timestamp with time zone) RETURNS bigint
    LANGUAGE sql IMMUTABLE
    AS $_$
  SELECT CASE WHEN $1 IS NULL THEN NULL ELSE FLOOR(EXTRACT(EPOCH FROM $1) * 1000)::BIGINT END;
$_$;


ALTER FUNCTION public.to_epoch_ms(ts timestamp with time zone) OWNER TO sylvain;

SET default_tablespace = '';

SET default_table_access_method = heap;

--
-- Name: api_keys; Type: TABLE; Schema: public; Owner: sylvain
--

CREATE TABLE public.api_keys (
    id uuid DEFAULT gen_random_uuid() NOT NULL,
    tenant_id uuid NOT NULL,
    provider text NOT NULL,
    label text,
    key_cipher text NOT NULL,
    meta jsonb,
    created_at timestamp with time zone DEFAULT now() NOT NULL,
    last_used_at timestamp with time zone
);


ALTER TABLE public.api_keys OWNER TO sylvain;

--
-- Name: artifacts; Type: TABLE; Schema: public; Owner: sylvain
--

CREATE TABLE public.artifacts (
    id uuid DEFAULT gen_random_uuid() NOT NULL,
    document_id uuid NOT NULL,
    kind public.asset_kind NOT NULL,
    page_no integer,
    uri_s3 text NOT NULL,
    extra jsonb,
    created_at timestamp with time zone DEFAULT now() NOT NULL,
    size_bytes bigint
);


ALTER TABLE public.artifacts OWNER TO sylvain;

--
-- Name: documents; Type: TABLE; Schema: public; Owner: sylvain
--

CREATE TABLE public.documents (
    id uuid DEFAULT gen_random_uuid() NOT NULL,
    tenant_id uuid NOT NULL,
    source_type public.source_type DEFAULT 'file'::public.source_type,
    uri_s3 text,
    external_id text,
    source_app text,
    project text,
    workspace text,
    notebook_id text,
    note_id text,
    sensitivity text,
    title text,
    filename text,
    file_ext text,
    mime_type text,
    byte_size bigint,
    checksum text,
    lang text,
    page_count integer,
    doc_version text,
    tags text[],
    status public.doc_status DEFAULT 'new'::public.doc_status NOT NULL,
    error_text text,
    docling_json_path text,
    markdown_path text,
    cold_storage_bucket text,
    created_at timestamp with time zone DEFAULT now() NOT NULL,
    updated_at timestamp with time zone,
    parsed_at timestamp with time zone,
    chunked_at timestamp with time zone,
    indexed_at timestamp with time zone,
    created_at_epoch bigint,
    updated_at_epoch bigint,
    CONSTRAINT documents_title_or_filename_chk CHECK (((COALESCE(title, ''::text) <> ''::text) OR (COALESCE(filename, ''::text) <> ''::text)))
);


ALTER TABLE public.documents OWNER TO sylvain;

--
-- Name: ingestion_jobs; Type: TABLE; Schema: public; Owner: sylvain
--

CREATE TABLE public.ingestion_jobs (
    id uuid DEFAULT gen_random_uuid() NOT NULL,
    tenant_id uuid NOT NULL,
    document_id uuid,
    status public.job_status DEFAULT 'queued'::public.job_status NOT NULL,
    params jsonb,
    metrics jsonb,
    worker text,
    logs text,
    error text,
    created_at timestamp with time zone DEFAULT now() NOT NULL,
    started_at timestamp with time zone,
    finished_at timestamp with time zone,
    source jsonb
);


ALTER TABLE public.ingestion_jobs OWNER TO sylvain;

--
-- Name: queries_log; Type: TABLE; Schema: public; Owner: sylvain
--

CREATE TABLE public.queries_log (
    id uuid DEFAULT gen_random_uuid() NOT NULL,
    tenant_id uuid NOT NULL,
    user_email text,
    question text NOT NULL,
    mmr_lambda real,
    k integer,
    fetch_k integer,
    filters jsonb,
    latency_ms integer,
    model text,
    citations jsonb,
    created_at timestamp with time zone DEFAULT now() NOT NULL
);


ALTER TABLE public.queries_log OWNER TO sylvain;

--
-- Name: tenants; Type: TABLE; Schema: public; Owner: sylvain
--

CREATE TABLE public.tenants (
    id uuid DEFAULT gen_random_uuid() NOT NULL,
    slug text NOT NULL,
    name text NOT NULL,
    created_at timestamp with time zone DEFAULT now() NOT NULL
);


ALTER TABLE public.tenants OWNER TO sylvain;

--
-- Name: v_documents_metadata; Type: VIEW; Schema: public; Owner: sylvain
--

CREATE VIEW public.v_documents_metadata AS
 SELECT id AS document_id,
    jsonb_build_object('document_id', (id)::text, 'title', title, 'source_type', (source_type)::text, 'source_app', source_app, 'uri_s3', uri_s3, 'checksum', checksum, 'tenant_id', (tenant_id)::text, 'project', project, 'workspace', workspace, 'notebook_id', notebook_id, 'note_id', note_id, 'sensitivity', sensitivity, 'lang', lang, 'doc_mimetype', mime_type, 'doc_bytes', byte_size, 'doc_pages', page_count, 'doc_version', doc_version, 'created_at_epoch', public.to_epoch_ms(created_at), 'updated_at_epoch', public.to_epoch_ms(updated_at), 'tags', COALESCE(to_jsonb(tags), '[]'::jsonb)) AS metadata_base
   FROM public.documents d;


ALTER VIEW public.v_documents_metadata OWNER TO sylvain;

--
-- Name: v_last_job_per_document; Type: VIEW; Schema: public; Owner: sylvain
--

CREATE VIEW public.v_last_job_per_document AS
 SELECT DISTINCT ON (document_id) document_id,
    id AS ingestion_job_id,
    status,
    created_at
   FROM public.ingestion_jobs j
  WHERE (document_id IS NOT NULL)
  ORDER BY document_id, created_at DESC;


ALTER VIEW public.v_last_job_per_document OWNER TO sylvain;

--
-- Name: v_last_queries; Type: VIEW; Schema: public; Owner: sylvain
--

CREATE VIEW public.v_last_queries AS
 SELECT id,
    created_at,
    user_email,
    question,
    k,
    fetch_k,
    mmr_lambda,
    latency_ms,
    jsonb_array_length(COALESCE(citations, '[]'::jsonb)) AS retrieved_count
   FROM public.queries_log q
  WHERE (created_at >= (now() - '7 days'::interval));


ALTER VIEW public.v_last_queries OWNER TO sylvain;

--
-- Name: api_keys api_keys_pkey; Type: CONSTRAINT; Schema: public; Owner: sylvain
--

ALTER TABLE ONLY public.api_keys
    ADD CONSTRAINT api_keys_pkey PRIMARY KEY (id);


--
-- Name: api_keys api_keys_tenant_id_provider_label_key; Type: CONSTRAINT; Schema: public; Owner: sylvain
--

ALTER TABLE ONLY public.api_keys
    ADD CONSTRAINT api_keys_tenant_id_provider_label_key UNIQUE (tenant_id, provider, label);


--
-- Name: artifacts artifacts_pkey; Type: CONSTRAINT; Schema: public; Owner: sylvain
--

ALTER TABLE ONLY public.artifacts
    ADD CONSTRAINT artifacts_pkey PRIMARY KEY (id);


--
-- Name: documents documents_pkey; Type: CONSTRAINT; Schema: public; Owner: sylvain
--

ALTER TABLE ONLY public.documents
    ADD CONSTRAINT documents_pkey PRIMARY KEY (id);


--
-- Name: documents documents_sha256_key; Type: CONSTRAINT; Schema: public; Owner: sylvain
--

ALTER TABLE ONLY public.documents
    ADD CONSTRAINT documents_sha256_key UNIQUE (checksum);


--
-- Name: ingestion_jobs ingestion_jobs_pkey; Type: CONSTRAINT; Schema: public; Owner: sylvain
--

ALTER TABLE ONLY public.ingestion_jobs
    ADD CONSTRAINT ingestion_jobs_pkey PRIMARY KEY (id);


--
-- Name: queries_log queries_log_pkey; Type: CONSTRAINT; Schema: public; Owner: sylvain
--

ALTER TABLE ONLY public.queries_log
    ADD CONSTRAINT queries_log_pkey PRIMARY KEY (id);


--
-- Name: tenants tenants_pkey; Type: CONSTRAINT; Schema: public; Owner: sylvain
--

ALTER TABLE ONLY public.tenants
    ADD CONSTRAINT tenants_pkey PRIMARY KEY (id);


--
-- Name: tenants tenants_slug_key; Type: CONSTRAINT; Schema: public; Owner: sylvain
--

ALTER TABLE ONLY public.tenants
    ADD CONSTRAINT tenants_slug_key UNIQUE (slug);


--
-- Name: idx_api_keys_tenant; Type: INDEX; Schema: public; Owner: sylvain
--

CREATE INDEX idx_api_keys_tenant ON public.api_keys USING btree (tenant_id);


--
-- Name: idx_artifacts_doc; Type: INDEX; Schema: public; Owner: sylvain
--

CREATE INDEX idx_artifacts_doc ON public.artifacts USING btree (document_id);


--
-- Name: idx_artifacts_kind; Type: INDEX; Schema: public; Owner: sylvain
--

CREATE INDEX idx_artifacts_kind ON public.artifacts USING btree (kind);


--
-- Name: idx_artifacts_meta_gin; Type: INDEX; Schema: public; Owner: sylvain
--

CREATE INDEX idx_artifacts_meta_gin ON public.artifacts USING gin (extra jsonb_path_ops);


--
-- Name: idx_documents_project; Type: INDEX; Schema: public; Owner: sylvain
--

CREATE INDEX idx_documents_project ON public.documents USING btree (project);


--
-- Name: idx_documents_sensitivity; Type: INDEX; Schema: public; Owner: sylvain
--

CREATE INDEX idx_documents_sensitivity ON public.documents USING btree (sensitivity);


--
-- Name: idx_documents_source_app; Type: INDEX; Schema: public; Owner: sylvain
--

CREATE INDEX idx_documents_source_app ON public.documents USING btree (source_app);


--
-- Name: idx_documents_status; Type: INDEX; Schema: public; Owner: sylvain
--

CREATE INDEX idx_documents_status ON public.documents USING btree (status);


--
-- Name: idx_documents_tenant; Type: INDEX; Schema: public; Owner: sylvain
--

CREATE INDEX idx_documents_tenant ON public.documents USING btree (tenant_id);


--
-- Name: idx_documents_title_trgm; Type: INDEX; Schema: public; Owner: sylvain
--

CREATE INDEX idx_documents_title_trgm ON public.documents USING gin (title public.gin_trgm_ops);


--
-- Name: idx_documents_workspace; Type: INDEX; Schema: public; Owner: sylvain
--

CREATE INDEX idx_documents_workspace ON public.documents USING btree (workspace);


--
-- Name: idx_jobs_doc; Type: INDEX; Schema: public; Owner: sylvain
--

CREATE INDEX idx_jobs_doc ON public.ingestion_jobs USING btree (document_id);


--
-- Name: idx_jobs_status; Type: INDEX; Schema: public; Owner: sylvain
--

CREATE INDEX idx_jobs_status ON public.ingestion_jobs USING btree (status);


--
-- Name: idx_jobs_tenant; Type: INDEX; Schema: public; Owner: sylvain
--

CREATE INDEX idx_jobs_tenant ON public.ingestion_jobs USING btree (tenant_id);


--
-- Name: idx_qlog_citations_gin; Type: INDEX; Schema: public; Owner: sylvain
--

CREATE INDEX idx_qlog_citations_gin ON public.queries_log USING gin (citations jsonb_path_ops);


--
-- Name: idx_qlog_created; Type: INDEX; Schema: public; Owner: sylvain
--

CREATE INDEX idx_qlog_created ON public.queries_log USING btree (created_at);


--
-- Name: idx_qlog_tenant; Type: INDEX; Schema: public; Owner: sylvain
--

CREATE INDEX idx_qlog_tenant ON public.queries_log USING btree (tenant_id);


--
-- Name: documents trg_documents_updated; Type: TRIGGER; Schema: public; Owner: sylvain
--

CREATE TRIGGER trg_documents_updated BEFORE UPDATE ON public.documents FOR EACH ROW EXECUTE FUNCTION public.set_updated_at();


--
-- Name: api_keys api_keys_tenant_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: sylvain
--

ALTER TABLE ONLY public.api_keys
    ADD CONSTRAINT api_keys_tenant_id_fkey FOREIGN KEY (tenant_id) REFERENCES public.tenants(id) ON DELETE CASCADE;


--
-- Name: artifacts artifacts_document_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: sylvain
--

ALTER TABLE ONLY public.artifacts
    ADD CONSTRAINT artifacts_document_id_fkey FOREIGN KEY (document_id) REFERENCES public.documents(id) ON DELETE CASCADE;


--
-- Name: documents documents_tenant_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: sylvain
--

ALTER TABLE ONLY public.documents
    ADD CONSTRAINT documents_tenant_id_fkey FOREIGN KEY (tenant_id) REFERENCES public.tenants(id) ON DELETE CASCADE;


--
-- Name: ingestion_jobs ingestion_jobs_document_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: sylvain
--

ALTER TABLE ONLY public.ingestion_jobs
    ADD CONSTRAINT ingestion_jobs_document_id_fkey FOREIGN KEY (document_id) REFERENCES public.documents(id) ON DELETE SET NULL;


--
-- Name: ingestion_jobs ingestion_jobs_tenant_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: sylvain
--

ALTER TABLE ONLY public.ingestion_jobs
    ADD CONSTRAINT ingestion_jobs_tenant_id_fkey FOREIGN KEY (tenant_id) REFERENCES public.tenants(id) ON DELETE CASCADE;


--
-- Name: queries_log queries_log_tenant_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: sylvain
--

ALTER TABLE ONLY public.queries_log
    ADD CONSTRAINT queries_log_tenant_id_fkey FOREIGN KEY (tenant_id) REFERENCES public.tenants(id) ON DELETE CASCADE;


--
-- PostgreSQL database dump complete
--

2. Qdrant Schema

This describes the configuration of the primary vector collection.

Collection Name: aurora_documents

{
  "params": {
    "vectors": {
      "size": 3072,
      "distance": "Cosine"
    }
  },
  "payload_schema": {}
}

Note: The payload_schema is currently empty, meaning Qdrant infers the data types of the metadata fields dynamically. For production stability, defining explicit payload indexes is recommended.

3. MinIO Object Storage

MinIO is used for storing raw files and processed artifacts. The structure is organized into the following buckets:

aurorarag-artifacts: Stores the results of the ingestion process, such as JSON or Markdown representations of the original documents.
aurorarag-uploads: A temporary staging area for files uploaded by users before they are processed.
aurorarag-backups: Reserved for database backups and other critical data snapshots.

Addendum (janvier 2026) — conversations multi-tours (follow-ups)

1) thread_state (état minimal par conversation)

2) queries_log (observabilité follow-up)

1. PostgreSQL Schema

2. Qdrant Schema

3. MinIO Object Storage

On this page

1) `thread_state` (état minimal par conversation)

2) `queries_log` (observabilité follow-up)