From 0bda025549b4917bab51954a44d4af60002959db Mon Sep 17 00:00:00 2001 From: Trenton Holmes <797416+stumpylog@users.noreply.github.com> Date: Fri, 19 Jun 2026 15:19:41 -0700 Subject: [PATCH] Reject malformed vector column types with exact-match parsing The vec0 column-type parser matched element type names with a prefix-only `sqlite3_strnicmp` (e.g. comparing the first 5 bytes against "float"). Any identifier sharing a prefix with a real type was silently coerced to that type: `float16[768]` became a 32-bit `float` column, `bitcoin[2]` became `bit`, and typos like `floaty` were accepted instead of erroring. Compare the full identifier length so only exact element-type spellings parse. `float32` is added as an explicit alias since it was previously accepted via the `float` prefix and is a natural spelling to keep working. This also unblocks adding real `float16`/`bfloat16` types (#27), which would otherwise collide with the `float` prefix. Two existing fixtures in tests/test-loadable.py declared a column as `float8[1]`, which only parsed through the old `float` prefix match; they are updated to `float[1]` so they keep exercising a valid float column. Co-Authored-By: Claude Opus 4.8 --- sqlite-vec.c | 16 ++++++++---- tests/test-column-type-parse.py | 44 +++++++++++++++++++++++++++++++++ tests/test-loadable.py | 4 +-- 3 files changed, 57 insertions(+), 7 deletions(-) create mode 100644 tests/test-column-type-parse.py diff --git a/sqlite-vec.c b/sqlite-vec.c index 7af3b6a..25de387 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -3012,13 +3012,19 @@ int vec0_parse_vector_column(const char *source, int source_length, token.token_type != TOKEN_TYPE_IDENTIFIER) { return SQLITE_EMPTY; } - if (sqlite3_strnicmp(token.start, "float", 5) == 0 || - sqlite3_strnicmp(token.start, "f32", 3) == 0) { + // Match the full identifier, not just a prefix: `sqlite3_strnicmp` only + // compares the given number of bytes, so a bare prefix check would coerce + // typos and lookalikes (e.g. `float16`, `bitcoin`) to a real type instead of + // rejecting them. 
+ const int typeLength = token.end - token.start; + if ((typeLength == 5 && sqlite3_strnicmp(token.start, "float", 5) == 0) || + (typeLength == 7 && sqlite3_strnicmp(token.start, "float32", 7) == 0) || + (typeLength == 3 && sqlite3_strnicmp(token.start, "f32", 3) == 0)) { elementType = SQLITE_VEC_ELEMENT_TYPE_FLOAT32; - } else if (sqlite3_strnicmp(token.start, "int8", 4) == 0 || - sqlite3_strnicmp(token.start, "i8", 2) == 0) { + } else if ((typeLength == 4 && sqlite3_strnicmp(token.start, "int8", 4) == 0) || + (typeLength == 2 && sqlite3_strnicmp(token.start, "i8", 2) == 0)) { elementType = SQLITE_VEC_ELEMENT_TYPE_INT8; - } else if (sqlite3_strnicmp(token.start, "bit", 3) == 0) { + } else if (typeLength == 3 && sqlite3_strnicmp(token.start, "bit", 3) == 0) { elementType = SQLITE_VEC_ELEMENT_TYPE_BIT; } else { return SQLITE_EMPTY; diff --git a/tests/test-column-type-parse.py b/tests/test-column-type-parse.py new file mode 100644 index 0000000..4ad53d0 --- /dev/null +++ b/tests/test-column-type-parse.py @@ -0,0 +1,44 @@ +import sqlite3 + +import pytest + +# Element-type spellings that vec0 must accept in a vector column definition. +# `float32` is undocumented but has always been accepted (it prefix-matched +# "float"), so it stays supported to avoid a silent regression. +VALID_TYPE_DEFS = [ + "float[2]", + "f32[2]", + "float32[2]", + "int8[2]", + "i8[2]", + "bit[8]", +] + +# Malformed type names that merely share a prefix with a valid element type. +# vec0 used a prefix-only strnicmp match and silently coerced these to the +# prefix's type (e.g. `float16` -> float32, `bitcoin` -> bit). That hides typos +# and would silently shadow real future types like float16/bfloat16, so the +# parser must reject any identifier that is not an exact element-type spelling. 
+INVALID_TYPE_DEFS = [ + "floaty[2]", + "floating[2]", + "float16[2]", + "f32x[2]", + "int8_t[2]", + "int8garbage[2]", + "i8x[2]", + "bitcoin[2]", + "bits[2]", + "bfloat16[2]", +] + + +@pytest.mark.parametrize("type_def", VALID_TYPE_DEFS) +def test_valid_vector_column_types_accepted(db, type_def): + db.execute(f"create virtual table t using vec0(a {type_def})") + + +@pytest.mark.parametrize("type_def", INVALID_TYPE_DEFS) +def test_malformed_vector_column_types_rejected(db, type_def): + with pytest.raises(sqlite3.OperationalError): + db.execute(f"create virtual table t using vec0(a {type_def})") diff --git a/tests/test-loadable.py b/tests/test-loadable.py index 0044144..5fed370 100644 --- a/tests/test-loadable.py +++ b/tests/test-loadable.py @@ -1320,7 +1320,7 @@ def test_vec0_text_pk(): create virtual table t using vec0( t_id text primary key, aaa float[1], - bbb float8[1], + bbb float[1], chunk_size=8 ); """ @@ -1437,7 +1437,7 @@ def test_vec0_best_index(): """ create virtual table t using vec0( aaa float[1], - bbb float8[1] + bbb float[1] ); """ )