storageDir = rtrim( $storageDir, '/\\' ); $this->numShards = $numShards; $this->normalizeVectors = $normalizeVectors; if ( ! is_dir( $this->storageDir ) ) { if ( ! mkdir( $this->storageDir, 0777, true ) ) { throw new \RuntimeException( "Failed to create storage directory: {$this->storageDir}" ); } } if ( ! is_writable( $this->storageDir ) ) { throw new \RuntimeException( "Storage directory is not writable: {$this->storageDir}" ); } }

    /**
     * Maps a vector ID to the index of the shard that stores it.
     *
     * The index is the absolute CRC32 of the ID modulo the shard count. The mapping must stay
     * stable: changing it would make previously written shards unreachable.
     * NOTE(review): assumes $this->numShards > 0; the constructor validation is not visible here.
     *
     * @param string $id Vector identifier.
     *
     * @return int Shard index.
     */
    private function getShardIndex( string $id ): int {
        return abs( crc32( $id ) ) % $this->numShards;
    }

    /**
     * Builds the path of the file backing the given shard.
     *
     * @param int $shardIndex
     *
     * @return string Storage directory + shard prefix + index + shard suffix.
     */
    private function getShardPath( int $shardIndex ): string {
        return $this->storageDir . '/' . self::SHARD_PREFIX . $shardIndex . self::SHARD_SUFFIX;
    }

    /**
     * Returns the number of bytes pack() emits for a single value of the given format code.
     *
     * Asking pack() itself keeps the reader in step with the writer for every float code
     * ('f', 'g', 'G' are 4 bytes; 'd', 'e', 'E' are 8 bytes) instead of hard-coding a guess.
     *
     * @param string $format A single pack() format code.
     *
     * @return int Size in bytes.
     */
    private function packedSize( string $format ): int {
        return strlen( pack( $format, 0.0 ) );
    }

    /**
     * Tells whether a vector can be stored or used as a query.
     *
     * A usable vector is non-empty, has at most 65535 components (the dimension is written as
     * an unsigned 16-bit integer) and contains only finite numeric values.
     *
     * @param array $vector Candidate vector.
     *
     * @return bool
     */
    private function isValidVector( array $vector ): bool {
        if ( empty( $vector ) || count( $vector ) > 0xFFFF ) {
            return false;
        }

        foreach ( $vector as $component ) {
            // is_numeric() accepts NAN/INF floats and strings such as "1e999"; both poison the math.
            if ( ! is_numeric( $component ) || ! is_finite( (float) $component ) ) {
                return false;
            }
        }

        return true;
    }

    /**
     * Builds the in-memory record for a vector, normalizing it when the store is configured to.
     *
     * With normalization enabled, a vector whose magnitude exceeds FLOAT_EPSILON is scaled to
     * unit length and stored with magnitude 1.0; a (near-)zero vector is kept as-is with
     * magnitude 0.0. Without normalization the raw magnitude is stored.
     *
     * @param string $id       Vector identifier.
     * @param array  $vector   Already validated vector.
     * @param mixed  $metadata Arbitrary JSON-encodable metadata or null.
     *
     * @return array Record with keys id, vector, dimension, magnitude, metadata.
     */
    private function buildRecord( string $id, array $vector, $metadata ): array {
        // Re-index and cast so string keys / numeric strings cannot reach pack() or dotProduct().
        $components = array_map( 'floatval', array_values( $vector ) );
        $magnitude  = $this->magnitude( $components );

        if ( $this->normalizeVectors ) {
            if ( $magnitude > self::FLOAT_EPSILON ) {
                $components = $this->normalizeVector( $components, $magnitude );
                $magnitude  = 1.0;
            } else {
                $magnitude = 0.0;
            }
        }

        return array(
            'id'        => $id,
            'vector'    => $components,
            'dimension' => count( $components ), // Store dimension for consistency checks
            'magnitude' => $magnitude,
            'metadata'  => $metadata,
        );
    }

    /**
     * Loads data from a binary shard file.
     *
     * Record layout, repeated until the end of the file:
     *   uint16 (big-endian) ID length, ID bytes,
     *   uint16 (big-endian) dimension, `dimension` floats in VECTOR_PACK_FORMAT,
     *   one float in MAGNITUDE_PACK_FORMAT,
     *   one byte metadata type, and for JSON metadata a uint32 (big-endian) length plus the JSON text.
     *
     * Parsing stops at the first incomplete record; the complete records read so far are
     * returned and an E_USER_WARNING reports how many bytes were consumed.
     *
     * @param int $shardIndex
     *
     * @return array The data from the shard.
     * @throws \RuntimeException If the file exists but cannot be read or parsed.
     */
    private function loadShard( int $shardIndex ): array {
        $filePath = $this->getShardPath( $shardIndex );
        if ( ! file_exists( $filePath ) || filesize( $filePath ) === 0 ) {
            return array();
        }

        $binaryContent = file_get_contents( $filePath );
        if ( $binaryContent === false ) {
            throw new \RuntimeException( "Failed to read shard file: {$filePath}" );
        }

        $items           = array();
        $offset          = 0; // End of the last fully parsed record.
        $contentLength   = strlen( $binaryContent );
        $vectorFloatSize = $this->packedSize( static::VECTOR_PACK_FORMAT );
        $magnitudeSize   = $this->packedSize( static::MAGNITUDE_PACK_FORMAT );

        while ( $offset < $contentLength ) {
            // Parse with a scratch cursor so a truncated record never counts as consumed.
            $cursor = $offset;

            // 1. ID Length (uint16)
            if ( $cursor + 2 > $contentLength ) {
                break;
            }
            $idLength = unpack( 'n', substr( $binaryContent, $cursor, 2 ) )[1];
            $cursor  += 2;

            // 2. ID (string)
            if ( $cursor + $idLength > $contentLength ) {
                break;
            }
            $id      = (string) substr( $binaryContent, $cursor, $idLength );
            $cursor += $idLength;

            // 3. Vector Dimension (uint16)
            if ( $cursor + 2 > $contentLength ) {
                break;
            }
            $dimension = unpack( 'n', substr( $binaryContent, $cursor, 2 ) )[1];
            $cursor   += 2;

            // 4. Vector Data (array of floats)
            $vectorByteLength = $dimension * $vectorFloatSize;
            if ( $cursor + $vectorByteLength > $contentLength ) {
                break;
            }
            $vector = array();
            if ( $dimension > 0 ) {
                $unpacked = unpack(
                    static::VECTOR_PACK_FORMAT . $dimension,
                    substr( $binaryContent, $cursor, $vectorByteLength )
                );
                if ( $unpacked === false ) {
                    break;
                }
                $vector = array_values( $unpacked ); // unpack() keys start at 1; re-index from 0.
            }
            $cursor += $vectorByteLength;

            // 5. Magnitude (float/double)
            if ( $cursor + $magnitudeSize > $contentLength ) {
                break;
            }
            $magData = unpack(
                static::MAGNITUDE_PACK_FORMAT . '1val',
                substr( $binaryContent, $cursor, $magnitudeSize )
            );
            if ( $magData === false ) {
                break;
            }
            $magnitude = $magData['val'];
            $cursor   += $magnitudeSize;

            // 6. Metadata Type (byte)
            if ( $cursor + 1 > $contentLength ) {
                break;
            }
            $metadataType = unpack( 'C', substr( $binaryContent, $cursor, 1 ) )[1];
            $cursor      += 1;

            // Any type other than JSON carries no payload and yields null metadata.
            $metadata = null;
            if ( $metadataType === self::METADATA_TYPE_JSON_STRING ) {
                // 7. Metadata Length (uint32)
                if ( $cursor + 4 > $contentLength ) {
                    break;
                }
                $metadataLength = unpack( 'N', substr( $binaryContent, $cursor, 4 ) )[1];
                $cursor        += 4;

                // 8. Metadata (JSON string)
                if ( $cursor + $metadataLength > $contentLength ) {
                    break;
                }
                $metadataJson = (string) substr( $binaryContent, $cursor, $metadataLength );
                $cursor      += $metadataLength;

                $metadata = json_decode( $metadataJson, true );
                if ( json_last_error() !== JSON_ERROR_NONE ) {
                    $decodeError = json_last_error_msg();
                    trigger_error( "Failed to decode metadata JSON for ID '{$id}' in shard {$shardIndex}: " . $decodeError, E_USER_WARNING );
                    $metadata = array( '_parsing_error' => $decodeError ); // Store error instead of failing all
                }
            }

            $items[] = array(
                'id'        => $id,
                'vector'    => $vector,
                'dimension' => $dimension, // Store dimension for consistency checks
                'magnitude' => $magnitude,
                'metadata'  => $metadata,
            );

            $offset = $cursor; // Record complete: commit.
        }

        if ( $offset !== $contentLength ) {
            trigger_error( "Shard file {$filePath} may be corrupt or incompletely read. Read {$offset} of {$contentLength} bytes.", E_USER_WARNING );
        }

        return $items;
    }

    /**
     * Saves data to a binary shard file using transactional writes.
     *
     * The whole shard is serialized in memory, written to a uniquely named temporary file in
     * the storage directory and then renamed over the shard file, so readers never observe a
     * half-written shard. See loadShard() for the record layout.
     *
     * @param int   $shardIndex
     * @param array $shardData List of records (id, vector, magnitude, metadata).
     *
     * @throws \RuntimeException If a record does not fit the format or saving fails.
     */
    private function saveShard( int $shardIndex, array $shardData ): void {
        $shardPath = $this->getShardPath( $shardIndex );
        $tempPath  = $this->storageDir . '/' . self::SHARD_PREFIX . $shardIndex . uniqid( '_temp_', true ) . self::TEMP_SUFFIX;

        $binaryOutput = '';
        foreach ( $shardData as $item ) {
            // 1. ID
            $id = (string) ( $item['id'] ?? '' );
            if ( strlen( $id ) > 0xFFFF ) {
                // pack( 'n' ) would silently wrap the length and corrupt every following record.
                throw new \RuntimeException( "Vector ID exceeds 65535 bytes and cannot be stored in shard {$shardIndex}." );
            }
            $binaryOutput .= pack( 'n', strlen( $id ) ); // ID Length (uint16)
            $binaryOutput .= $id;                        // ID (string)

            // 2. Vector
            $vector    = array_values( $item['vector'] ?? array() ); // Spread below needs a plain list.
            $dimension = count( $vector );
            if ( $dimension > 0xFFFF ) {
                throw new \RuntimeException( "Vector for ID '{$id}' has {$dimension} dimensions; the format supports at most 65535." );
            }
            $binaryOutput .= pack( 'n', $dimension ); // Vector Dimension (uint16)
            if ( $dimension > 0 ) {
                // '*' consumes all remaining arguments, so the components are spread individually.
                $binaryOutput .= pack( static::VECTOR_PACK_FORMAT . '*', ...$vector ); // Vector data
            }

            // 3. Magnitude
            $magnitude     = (float) ( $item['magnitude'] ?? 0.0 );
            $binaryOutput .= pack( static::MAGNITUDE_PACK_FORMAT, $magnitude ); // Magnitude (float/double)

            // 4. Metadata
            $metadata = $item['metadata'] ?? null;
            if ( $metadata === null ) {
                $binaryOutput .= pack( 'C', self::METADATA_TYPE_NULL ); // Metadata Type (byte)
                continue;
            }

            $binaryOutput .= pack( 'C', self::METADATA_TYPE_JSON_STRING ); // Metadata Type (byte)
            $metadataJson  = json_encode( $metadata );
            if ( $metadataJson === false ) {
                $encodeError = json_last_error_msg();
                trigger_error( "Failed to JSON encode metadata for ID '{$id}': " . $encodeError, E_USER_WARNING );
                // Encode the placeholder properly so it is always valid JSON when read back.
                $metadataJson = (string) json_encode( array( '_encoding_error' => $encodeError ) );
            }
            $binaryOutput .= pack( 'N', strlen( $metadataJson ) ); // Metadata Length (uint32)
            $binaryOutput .= $metadataJson;                        // Metadata (JSON string)
        }

        $bytesWritten = file_put_contents( $tempPath, $binaryOutput );
        // A short write (e.g. disk full) must not be promoted to the live shard.
        if ( $bytesWritten === false || $bytesWritten !== strlen( $binaryOutput ) ) {
            if ( file_exists( $tempPath ) ) {
                @unlink( $tempPath );
            }
            throw new \RuntimeException( "Failed to write to temporary shard file: {$tempPath}" );
        }

        if ( ! rename( $tempPath, $shardPath ) ) {
            if ( file_exists( $tempPath ) ) {
                @unlink( $tempPath );
            }
            throw new \RuntimeException( "Failed to rename temporary shard file '{$tempPath}' to '{$shardPath}'." );
        }
    }

    /**
     * Adds a single vector to the store.
     *
     * @param string $id       Unique identifier (at most 65535 bytes).
     * @param array  $vector   Non-empty list of finite numeric components.
     * @param mixed  $metadata Optional JSON-encodable metadata.
     *
     * @return bool True when stored; false when the input is invalid or the ID already exists.
     * @throws \RuntimeException If the shard cannot be read or written.
     */
    public function addVector( string $id, array $vector, $metadata = null ): bool {
        if ( strlen( $id ) > 0xFFFF || ! $this->isValidVector( $vector ) ) {
            return false;
        }

        $shardIndex = $this->getShardIndex( $id );
        $shardData  = $this->loadShard( $shardIndex );

        foreach ( $shardData as $existing ) {
            if ( $existing['id'] === $id ) {
                return false; // ID already exists
            }
        }

        $shardData[] = $this->buildRecord( $id, $vector, $metadata );
        $this->saveShard( $shardIndex, $shardData );

        return true;
    }

    /**
     * Adds multiple vectors in a batch. More efficient than single adds for large amounts,
     * because each affected shard is loaded and rewritten once.
     *
     * Entries with a missing/non-scalar ID or an unusable vector are reported in failed_ids.
     * IDs already present in the store, or repeated within the batch, are reported in
     * duplicate_ids. A shard that fails to save reports all of its pending IDs as failed.
     *
     * @param array $vectorsData Array of ['id' => ..., 'vector' => ..., 'metadata' => ...]
     *
     * @return array ['succeeded' => count, 'failed_ids' => [...], 'duplicate_ids' => [...]]
     */
    public function addVectorsBatch( array $vectorsData ): array {
        $results        = array( 'succeeded' => 0, 'failed_ids' => array(), 'duplicate_ids' => array() );
        $vectorsByShard = array();

        // Group vectors by shard
        foreach ( $vectorsData as $vData ) {
            $usable = is_array( $vData )
                && isset( $vData['id'], $vData['vector'] )
                && is_scalar( $vData['id'] )
                && strlen( (string) $vData['id'] ) <= 0xFFFF
                && is_array( $vData['vector'] )
                && $this->isValidVector( $vData['vector'] );

            if ( ! $usable ) {
                $results['failed_ids'][] = $vData['id'] ?? 'unknown_id_missing_vector';
                continue;
            }

            $shardIndex                      = $this->getShardIndex( (string) $vData['id'] );
            $vectorsByShard[ $shardIndex ][] = $vData;
        }

        // Process each shard
        foreach ( $vectorsByShard as $shardIndex => $shardVectors ) {
            $currentShardData = $this->loadShard( $shardIndex );

            // Keyed set: O(1) lookups and no loose-comparison collisions such as '1e3' == '1000'.
            $knownIds = array_fill_keys( array_column( $currentShardData, 'id' ), true );

            $vectorsToAddThisShard = array();
            $pendingIds            = array(); // IDs exactly as supplied by the caller, for reporting.

            foreach ( $shardVectors as $vData ) {
                $id = (string) $vData['id'];

                if ( isset( $knownIds[ $id ] ) ) {
                    $results['duplicate_ids'][] = $vData['id'];
                    continue;
                }

                $vectorsToAddThisShard[] = $this->buildRecord( $id, $vData['vector'], $vData['metadata'] ?? null );
                $pendingIds[]            = $vData['id'];
                $knownIds[ $id ]         = true; // Prevent adding same ID twice in one batch
            }

            if ( empty( $vectorsToAddThisShard ) ) {
                continue;
            }

            try {
                $this->saveShard( $shardIndex, array_merge( $currentShardData, $vectorsToAddThisShard ) );
                $results['succeeded'] += count( $vectorsToAddThisShard );
            } catch ( \Exception $e ) {
                trigger_error( "Batch add failed for shard {$shardIndex}: " . $e->getMessage(), E_USER_WARNING );
                foreach ( $pendingIds as $failedId ) {
                    $results['failed_ids'][] = $failedId;
                }
            }
        }

        return $results;
    }

    /**
     * Removes the vector with the given ID.
     *
     * @param string $id
     *
     * @return bool True when a vector was removed, false when the ID was not found.
     * @throws \RuntimeException If the shard cannot be read or rewritten.
     */
    public function removeVector( string $id ): bool {
        $shardIndex = $this->getShardIndex( $id );
        $shardData  = $this->loadShard( $shardIndex );

        $remaining = array_values(
            array_filter(
                $shardData,
                static fn( $item ) => $item['id'] !== $id
            )
        );

        if ( count( $remaining ) === count( $shardData ) ) {
            return false;
        }

        $this->saveShard( $shardIndex, $remaining );

        return true;
    }

    /**
     * Looks up a stored record by ID.
     *
     * @param string $id
     *
     * @return array|null The record (id, vector, dimension, magnitude, metadata) or null.
     * @throws \RuntimeException If the shard cannot be read.
     */
    public function getVectorById( string $id ): ?array {
        foreach ( $this->loadShard( $this->getShardIndex( $id ) ) as $item ) {
            if ( $item['id'] === $id ) {
                return $item;
            }
        }

        return null;
    }

    /**
     * Turns the $metadataFilter argument of findSimilar() into a predicate.
     *
     * A callable is used as-is. A non-empty array becomes an "all key/value pairs match
     * strictly" predicate. Anything else means "no filtering".
     *
     * @param mixed $metadataFilter Callable, associative array or null.
     *
     * @return callable|null Predicate receiving the item's metadata, or null for no filter.
     */
    private function buildMetadataFilter( $metadataFilter ): ?callable {
        if ( is_callable( $metadataFilter ) ) {
            return $metadataFilter;
        }

        if ( ! is_array( $metadataFilter ) || empty( $metadataFilter ) ) {
            return null;
        }

        return static function ( $metadata ) use ( $metadataFilter ): bool {
            if ( ! is_array( $metadata ) ) {
                return false;
            }
            foreach ( $metadataFilter as $key => $expected ) {
                // array_key_exists (not isset) so that an expected value of null can match.
                if ( ! array_key_exists( $key, $metadata ) || $metadata[ $key ] !== $expected ) {
                    return false;
                }
            }

            return true;
        };
    }

    /**
     * Finds the stored vectors most similar to the query by cosine similarity.
     *
     * Every shard is scanned. Items are skipped when they fail the metadata filter, have a
     * different dimension than the query (with a warning), have a (near-)zero magnitude, or
     * produce a NAN score.
     *
     * @param array $queryVector    Non-empty list of finite numeric components.
     * @param int   $topK           Maximum number of results; values <= 0 yield no results.
     * @param mixed $metadataFilter Callable receiving the metadata, an array of required
     *                              key/value pairs, or null for no filtering.
     *
     * @return array List of ['id' => ..., 'metadata' => ..., 'score' => float in [-1, 1]],
     *               sorted by descending score.
     * @throws \RuntimeException If a shard cannot be read.
     */
    public function findSimilar( array $queryVector, int $topK = 5, $metadataFilter = null ): array {
        if ( $topK <= 0 || ! $this->isValidVector( $queryVector ) ) {
            return array();
        }

        $queryVector    = array_values( $queryVector ); // dotProduct() indexes from 0.
        $queryMagnitude = $this->magnitude( $queryVector );
        if ( ! is_finite( $queryMagnitude ) || $queryMagnitude < self::FLOAT_EPSILON ) {
            return array();
        }

        $normalizedQueryVector = $this->normalizeVector( $queryVector, $queryMagnitude );
        $queryDimension        = count( $queryVector );
        $filterFn              = $this->buildMetadataFilter( $metadataFilter );
        $allResults            = array();

        for ( $i = 0; $i < $this->numShards; $i ++ ) {
            foreach ( $this->loadShard( $i ) as $item ) {
                // Basic validation from loaded binary data
                if ( ! isset( $item['id'], $item['vector'], $item['magnitude'], $item['dimension'] ) || ! is_array( $item['vector'] ) ) {
                    trigger_error( "Skipping invalid item in shard {$i} after binary load.", E_USER_WARNING );
                    continue;
                }

                if ( $filterFn !== null && ! $filterFn( $item['metadata'] ?? null ) ) {
                    continue;
                }

                if ( $item['dimension'] !== $queryDimension ) {
                    trigger_error( "Dimension mismatch ID '{$item['id']}' (stored {$item['dimension']}, query {$queryDimension}). Skipping.", E_USER_WARNING );
                    continue;
                }

                $storedMagnitude = (float) $item['magnitude'];
                if ( $storedMagnitude < self::FLOAT_EPSILON ) {
                    continue;
                }

                // The query is unit length, so only the stored magnitude has to be divided out,
                // and not even that when the store keeps unit vectors.
                $similarity   = $this->dotProduct( $normalizedQueryVector, $item['vector'] );
                $storedIsUnit = $this->normalizeVectors && abs( $storedMagnitude - 1.0 ) < self::FLOAT_EPSILON;
                if ( ! $storedIsUnit ) {
                    $similarity /= $storedMagnitude;
                }

                if ( is_nan( $similarity ) ) {
                    continue; // NAN has no defined ordering and would corrupt the sort.
                }

                $allResults[] = array(
                    'id'       => $item['id'],
                    'metadata' => $item['metadata'] ?? null,
                    'score'    => max( - 1.0, min( 1.0, $similarity ) ), // Clamp rounding overshoot.
                );
            }
        }

        usort( $allResults, static fn( $a, $b ) => $b['score'] <=> $a['score'] );

        return array_slice( $allResults, 0, $topK );
    }

    // --- Helper Functions (normalizeVector, dotProduct, magnitude) ---

    /**
     * Scales a vector to unit length.
     *
     * @param array      $vec       Numeric components.
     * @param float|null $magnitude Precomputed magnitude, or null to compute it here.
     *
     * @return array Unit-length vector, or an all-zero vector of the same size when the
     *               magnitude is below FLOAT_EPSILON.
     */
    private function normalizeVector( array $vec, ?float $magnitude = null ): array {
        $length = $magnitude ?? $this->magnitude( $vec );

        if ( $length < self::FLOAT_EPSILON ) {
            return array_fill( 0, count( $vec ), 0.0 );
        }

        $unit = array();
        foreach ( $vec as $component ) {
            $unit[] = (float) $component / $length;
        }

        return $unit;
    }

    /**
     * Computes the dot product of two 0-indexed vectors.
     *
     * @param array $vec1
     * @param array $vec2
     *
     * @return float The dot product over the indices of $vec1, or NAN when an index is
     *               missing in either vector or holds a non-numeric value.
     */
    private function dotProduct( array $vec1, array $vec2 ): float {
        $sum    = 0.0;
        $length = count( $vec1 );

        for ( $i = 0; $i < $length; $i ++ ) {
            if ( ! isset( $vec1[ $i ], $vec2[ $i ] ) || ! is_numeric( $vec1[ $i ] ) || ! is_numeric( $vec2[ $i ] ) ) {
                return NAN;
            }
            $sum += (float) $vec1[ $i ] * (float) $vec2[ $i ];
        }

        return $sum;
    }

    /**
     * Computes the Euclidean length of a vector.
     *
     * @param array $vec
     *
     * @return float The magnitude, or NAN when a component is not numeric.
     */
    private function magnitude( array $vec ): float {
        $sumOfSquares = 0.0;

        foreach ( $vec as $component ) {
            if ( ! is_numeric( $component ) ) {
                return NAN;
            }
            $sumOfSquares += (float) $component * (float) $component;
        }

        return sqrt( max( 0.0, $sumOfSquares ) );
    }

    // --- Management Functions (getAllVectors, clearAll) ---

    /**
     * Returns every stored record from every shard, in shard order.
     *
     * @return array List of records (id, vector, dimension, magnitude, metadata).
     * @throws \RuntimeException If a shard cannot be read.
     */
    public function getAllVectors(): array {
        $allData = array();

        for ( $i = 0; $i < $this->numShards; $i ++ ) {
            foreach ( $this->loadShard( $i ) as $item ) {
                $allData[] = $item;
            }
        }

        return $allData;
    }

    /**
     * Deletes all shard files and any leftover temporary files of this store.
     *
     * Temporary files are removed on a best-effort basis and do not affect the result.
     *
     * @return bool True when every existing shard file was deleted, false otherwise.
     */
    public function clearAll(): bool {
        $success = true;

        for ( $i = 0; $i < $this->numShards; $i ++ ) {
            $filePath = $this->getShardPath( $i );
            if ( file_exists( $filePath ) && ! unlink( $filePath ) ) {
                trigger_error( "Failed to delete shard file: {$filePath}", E_USER_WARNING );
                $success = false;
            }

            $tempPattern = $this->storageDir . '/' . self::SHARD_PREFIX . $i . '_temp_*' . self::TEMP_SUFFIX;
            // glob() returns false on error; fall back to an empty list instead of iterating false.
            foreach ( glob( $tempPattern ) ?: array() as $tempFile ) {
                @unlink( $tempFile );
            }
        }

        if ( ! is_dir( $this->storageDir ) ) {
            @mkdir( $this->storageDir, 0777, true );
        }

        return $success;
    }
}

// --- Example Usage ---
$storageDirectory = './vector_store_binary_shards';
$numberOfShards   = 3;
$normalizeOnAdd   = true;

$vectorStore = new SimpleShardedVectorStoreBinary( $storageDirectory, $numberOfShards, $normalizeOnAdd );
echo "Using Binary Store. Normalization on Add: " . ( $normalizeOnAdd ? 'Enabled' : 'Disabled' ) . "\n";

// Optional: Clear previous data
$vectorStore->clearAll();
echo "Store cleared.\n";

$vectorsToAdd = array(
    array( 'id' => 'cat1_bin', 'vector' => array( 0.1, 0.8, 0.1, 0.5 ), 'metadata' => array( 'topic' => 'animal', 'source' => 'A' ) ),
    array( 'id' => 'dog1_bin', 'vector' => array( 0.9, 0.1, 0.1, 0.2 ), 'metadata' => array( 'topic' => 'animal', 'source' => 'B' ) ),
    array( 'id' => 'sky1_bin', 'vector' => array( 0.1, 0.1, 0.9, 0.8 ), 'metadata' => array( 'topic' => 'weather', 'source' => 'C' ) ),
);

// For batch add
$vectorsToAddMore = array(
    array( 'id' => 'cat2_bin', 'vector' => array( 0.2, 0.7, 0.0, 0.6 ), 'metadata' => array( 'topic' => 'animal', 'source' => 'A' ) ),
    array( 'id' => 'dog2_bin', 'vector' => array( 0.8, 0.2, 0.0, 0.3 ), 'metadata' => array( 'topic' => 'animal', 'source' => 'D' ) ), // new source
    array( 'id' => 'sky2_bin', 'vector' => array( 0.0, 0.2, 0.8, 0.7 ), 'metadata' => array( 'topic' => 'weather', 'source' => 'A' ) ),
    // Despite its name, this ID is distinct from 'cat1_bin', so it is stored normally and is
    // NOT reported in 'duplicate_ids'. Use the exact ID 'cat1_bin' to exercise the duplicate path.
    array( 'id' => 'dup_cat1_bin', 'vector' => array( 0.1, 0.1, 0.1, 0.1 ), 'metadata' => array( 'topic' => 'duplicate_test' ) ),
);

echo "\nAdding vectors individually...\n";
foreach ( $vectorsToAdd as $v ) {
    if ( $vectorStore->addVector( $v['id'], $v['vector'], $v['metadata'] ) ) {
        echo "  - Added ID: {$v['id']}\n";
    } else {
        echo "  - Failed or duplicate ID: {$v['id']}\n";
    }
}

echo "\nAdding vectors in batch...\n";
// The batch IDs are all unique relative to the individually added vectors above.
$batchResult = $vectorStore->addVectorsBatch( $vectorsToAddMore );
echo "Batch Add Results:\n";
print_r( $batchResult );

// Test search
$queryVector = array( 0.15, 0.75, 0.05, 0.55 ); // 4D query
echo "\nSearching for vectors similar to: [" . implode( ', ', $queryVector ) . "]\n";
$similar = $vectorStore->findSimilar( $queryVector, 3 );
print_r( $similar );

// Test get by ID
$item = $vectorStore->getVectorById( 'dog1_bin' );
echo "\nRetrieved 'dog1_bin':\n";
print_r( $item );

echo "\nAll vectors (use with caution on large stores):\n";
// print_r($vectorStore->getAllVectors()); // Can be very verbose

echo "\nDone.\n";