diff --git a/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs b/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs index 7ac7968c8f..198b94df49 100644 --- a/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs +++ b/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs @@ -29,24 +29,24 @@ impl CudaServerKey { /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS; /// /// let gpu_index = 0; - /// let mut stream = CudaStreams::new_single_gpu(gpu_index); + /// let mut streams = CudaStreams::new_single_gpu(gpu_index); /// /// // We have 4 * 2 = 8 bits of message /// let size = 4; - /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream); + /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams); /// /// let msg = 1u64; /// /// let ct = cks.encrypt(msg); /// /// // Copy to GPU - /// let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &mut stream); + /// let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &mut streams); /// /// // Compute homomorphically a bitwise and: - /// let d_ct_res = sks.unchecked_bitnot(&d_ct, &mut stream); + /// let d_ct_res = sks.unchecked_bitnot(&d_ct, &mut streams); /// /// // Copy back to CPU - /// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream); + /// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams); /// /// // Decrypt: /// let dec: u64 = cks.decrypt(&ct_res); @@ -55,24 +55,24 @@ impl CudaServerKey { pub fn unchecked_bitnot( &self, ct: &T, - stream: &CudaStreams, + streams: &CudaStreams, ) -> T { - let mut result = unsafe { ct.duplicate_async(stream) }; - self.unchecked_bitnot_assign(&mut result, stream); + let mut result = unsafe { ct.duplicate_async(streams) }; + self.unchecked_bitnot_assign(&mut result, streams); result } /// # Safety /// - /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must - /// not be dropped until stream is synchronised + /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must + /// not be dropped until streams is synchronised pub unsafe fn unchecked_bitnot_assign_async( &self, ct: &mut T, - stream: &CudaStreams, + streams: &CudaStreams, ) { // We do (-ciphertext) + (msg_mod -1) as it allows to avoid an allocation - cuda_lwe_ciphertext_negate_assign(&mut ct.as_mut().d_blocks, stream); + cuda_lwe_ciphertext_negate_assign(&mut ct.as_mut().d_blocks, streams); let ct_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0; @@ -81,14 +81,21 @@ impl CudaServerKey { let shift_plaintext = u64::from(scalar) * delta; let scalar_vector = vec![shift_plaintext; ct_blocks]; - let mut d_decomposed_scalar = - CudaVec::::new_async(ct.as_ref().d_blocks.lwe_ciphertext_count().0, stream, 0); - d_decomposed_scalar.copy_from_cpu_async(scalar_vector.as_slice(), stream, 0); + let mut d_decomposed_scalar = CudaVec::::new_async( + ct.as_ref().d_blocks.lwe_ciphertext_count().0, + streams, + streams.gpu_indexes[0], + ); + d_decomposed_scalar.copy_from_cpu_async( + scalar_vector.as_slice(), + streams, + streams.gpu_indexes[0], + ); cuda_lwe_ciphertext_plaintext_add_assign( &mut ct.as_mut().d_blocks, &d_decomposed_scalar, - stream, + streams, ); ct.as_mut().info = ct.as_ref().info.after_bitnot(); } @@ -96,12 +103,12 @@ impl CudaServerKey { pub fn unchecked_bitnot_assign( &self, ct: &mut T, - stream: &CudaStreams, + streams: &CudaStreams, ) { unsafe { - self.unchecked_bitnot_assign_async(ct, stream); + self.unchecked_bitnot_assign_async(ct, streams); } - 
stream.synchronize(); + streams.synchronize(); } /// Computes homomorphically bitand between two ciphertexts encrypting integer values. @@ -121,11 +128,11 @@ impl CudaServerKey { /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS; /// /// let gpu_index = 0; - /// let mut stream = CudaStreams::new_single_gpu(gpu_index); + /// let mut streams = CudaStreams::new_single_gpu(gpu_index); /// /// // We have 4 * 2 = 8 bits of message /// let size = 4; - /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream); + /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams); /// /// let msg1 = 201u64; /// let msg2 = 1u64; @@ -134,14 +141,14 @@ impl CudaServerKey { /// let ct2 = cks.encrypt(msg2); /// /// // Copy to GPU - /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut stream); - /// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut stream); + /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut streams); + /// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut streams); /// /// // Compute homomorphically a bitwise and: - /// let d_ct_res = sks.unchecked_bitand(&d_ct1, &d_ct2, &mut stream); + /// let d_ct_res = sks.unchecked_bitand(&d_ct1, &d_ct2, &mut streams); /// /// // Copy back to CPU - /// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream); + /// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams); /// /// // Decrypt: /// let dec: u64 = cks.decrypt(&ct_res); @@ -151,23 +158,23 @@ impl CudaServerKey { &self, ct_left: &T, ct_right: &T, - stream: &CudaStreams, + streams: &CudaStreams, ) -> T { - let mut result = unsafe { ct_left.duplicate_async(stream) }; - self.unchecked_bitand_assign(&mut result, ct_right, stream); + let mut result = unsafe { ct_left.duplicate_async(streams) }; + self.unchecked_bitand_assign(&mut result, ct_right, streams); result } /// # Safety /// - /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must - /// not be dropped until stream is synchronised + /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must + /// not be dropped until streams is synchronised pub unsafe fn unchecked_bitop_assign_async( &self, ct_left: &mut T, ct_right: &T, op: BitOpType, - stream: &CudaStreams, + streams: &CudaStreams, ) { assert_eq!( ct_left.as_ref().d_blocks.lwe_dimension(), @@ -183,7 +190,7 @@ impl CudaServerKey { match &self.bootstrapping_key { CudaBootstrappingKey::Classic(d_bsk) => { unchecked_bitop_integer_radix_kb_assign_async( - stream, + streams, &mut ct_left.as_mut().d_blocks.0.d_vec, &ct_right.as_ref().d_blocks.0.d_vec, &d_bsk.d_vec, @@ -210,7 +217,7 @@ impl CudaServerKey { } CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { unchecked_bitop_integer_radix_kb_assign_async( - stream, + streams, &mut ct_left.as_mut().d_blocks.0.d_vec, &ct_right.as_ref().d_blocks.0.d_vec, &d_multibit_bsk.d_vec, @@ -242,13 +249,13 @@ impl CudaServerKey { &self, ct_left: &mut T, ct_right: &T, - stream: &CudaStreams, + streams: &CudaStreams, ) { unsafe { - self.unchecked_bitop_assign_async(ct_left, ct_right, BitOpType::And, stream); + self.unchecked_bitop_assign_async(ct_left, ct_right, BitOpType::And, streams); ct_left.as_mut().info = ct_left.as_ref().info.after_bitand(&ct_right.as_ref().info); } - stream.synchronize(); + streams.synchronize(); } /// Computes homomorphically bitor between two ciphertexts encrypting integer 
values. @@ -268,11 +275,11 @@ impl CudaServerKey { /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS; /// /// let gpu_index = 0; - /// let mut stream = CudaStreams::new_single_gpu(gpu_index); + /// let mut streams = CudaStreams::new_single_gpu(gpu_index); /// /// // We have 4 * 2 = 8 bits of message /// let size = 4; - /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream); + /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams); /// /// let msg1 = 200u64; /// let msg2 = 1u64; @@ -281,14 +288,14 @@ impl CudaServerKey { /// let ct2 = cks.encrypt(msg2); /// /// // Copy to GPU - /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut stream); - /// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut stream); + /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut streams); + /// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut streams); /// /// // Compute homomorphically a bitwise and: - /// let d_ct_res = sks.unchecked_bitor(&d_ct1, &d_ct2, &mut stream); + /// let d_ct_res = sks.unchecked_bitor(&d_ct1, &d_ct2, &mut streams); /// /// // Copy back to CPU - /// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream); + /// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams); /// /// // Decrypt: /// let dec: u64 = cks.decrypt(&ct_res); @@ -298,10 +305,10 @@ impl CudaServerKey { &self, ct_left: &T, ct_right: &T, - stream: &CudaStreams, + streams: &CudaStreams, ) -> T { - let mut result = unsafe { ct_left.duplicate_async(stream) }; - self.unchecked_bitor_assign(&mut result, ct_right, stream); + let mut result = unsafe { ct_left.duplicate_async(streams) }; + self.unchecked_bitor_assign(&mut result, ct_right, streams); result } @@ -309,13 +316,13 @@ impl CudaServerKey { &self, ct_left: &mut T, ct_right: &T, - stream: &CudaStreams, + streams: &CudaStreams, ) { unsafe { - self.unchecked_bitop_assign_async(ct_left, ct_right, BitOpType::Or, stream); + self.unchecked_bitop_assign_async(ct_left, ct_right, BitOpType::Or, streams); ct_left.as_mut().info = ct_left.as_ref().info.after_bitor(&ct_right.as_ref().info); } - stream.synchronize(); + streams.synchronize(); } /// Computes homomorphically bitxor between two ciphertexts encrypting integer values. 
@@ -335,11 +342,11 @@ impl CudaServerKey { /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS; /// /// let gpu_index = 0; - /// let mut stream = CudaStreams::new_single_gpu(gpu_index); + /// let mut streams = CudaStreams::new_single_gpu(gpu_index); /// /// // We have 4 * 2 = 8 bits of message /// let size = 4; - /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream); + /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams); /// /// let msg1 = 49; /// let msg2 = 64; @@ -348,14 +355,14 @@ impl CudaServerKey { /// let ct2 = cks.encrypt(msg2); /// /// // Copy to GPU - /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut stream); - /// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut stream); + /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut streams); + /// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut streams); /// /// // Compute homomorphically a bitwise and: - /// let d_ct_res = sks.unchecked_bitxor(&d_ct1, &d_ct2, &mut stream); + /// let d_ct_res = sks.unchecked_bitxor(&d_ct1, &d_ct2, &mut streams); /// /// // Copy back to CPU - /// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream); + /// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams); /// /// // Decrypt: /// let dec: u64 = cks.decrypt(&ct_res); @@ -365,10 +372,10 @@ impl CudaServerKey { &self, ct_left: &T, ct_right: &T, - stream: &CudaStreams, + streams: &CudaStreams, ) -> T { - let mut result = unsafe { ct_left.duplicate_async(stream) }; - self.unchecked_bitxor_assign(&mut result, ct_right, stream); + let mut result = unsafe { ct_left.duplicate_async(streams) }; + self.unchecked_bitxor_assign(&mut result, ct_right, streams); result } @@ -376,13 +383,13 @@ impl CudaServerKey { &self, ct_left: &mut T, ct_right: &T, - stream: &CudaStreams, + streams: &CudaStreams, ) { unsafe { - self.unchecked_bitop_assign_async(ct_left, ct_right, BitOpType::Xor, stream); + self.unchecked_bitop_assign_async(ct_left, ct_right, BitOpType::Xor, streams); ct_left.as_mut().info = ct_left.as_ref().info.after_bitxor(&ct_right.as_ref().info); } - stream.synchronize(); + streams.synchronize(); } /// Computes homomorphically bitand between two ciphertexts encrypting integer values. 
@@ -402,11 +409,11 @@ impl CudaServerKey { /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS; /// /// let gpu_index = 0; - /// let mut stream = CudaStreams::new_single_gpu(gpu_index); + /// let mut streams = CudaStreams::new_single_gpu(gpu_index); /// /// // We have 4 * 2 = 8 bits of message /// let size = 4; - /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream); + /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams); /// /// let msg1 = 201u64; /// let msg2 = 1u64; @@ -415,14 +422,14 @@ impl CudaServerKey { /// let ct2 = cks.encrypt(msg2); /// /// // Copy to GPU - /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut stream); - /// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut stream); + /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut streams); + /// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut streams); /// /// // Compute homomorphically a bitwise and: - /// let d_ct_res = sks.bitand(&d_ct1, &d_ct2, &mut stream); + /// let d_ct_res = sks.bitand(&d_ct1, &d_ct2, &mut streams); /// /// // Copy back to CPU - /// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream); + /// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams); /// /// // Decrypt: /// let dec: u64 = cks.decrypt(&ct_res); @@ -432,22 +439,22 @@ impl CudaServerKey { &self, ct_left: &T, ct_right: &T, - stream: &CudaStreams, + streams: &CudaStreams, ) -> T { - let mut result = unsafe { ct_left.duplicate_async(stream) }; - self.bitand_assign(&mut result, ct_right, stream); + let mut result = unsafe { ct_left.duplicate_async(streams) }; + self.bitand_assign(&mut result, ct_right, streams); result } /// # Safety /// - /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must - /// not be dropped until stream is synchronised + /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must + /// not be dropped until streams is synchronised pub unsafe fn bitand_assign_async( &self, ct_left: &mut T, ct_right: &T, - stream: &CudaStreams, + streams: &CudaStreams, ) { let mut tmp_rhs; @@ -458,36 +465,36 @@ impl CudaServerKey { ) { (true, true) => (ct_left, ct_right), (true, false) => { - tmp_rhs = ct_right.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_rhs, stream); + tmp_rhs = ct_right.duplicate_async(streams); + self.full_propagate_assign_async(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { - self.full_propagate_assign_async(ct_left, stream); + self.full_propagate_assign_async(ct_left, streams); (ct_left, ct_right) } (false, false) => { - tmp_rhs = ct_right.duplicate_async(stream); + tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(ct_left, stream); - self.full_propagate_assign_async(&mut tmp_rhs, stream); + self.full_propagate_assign_async(ct_left, streams); + self.full_propagate_assign_async(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } } }; - self.unchecked_bitop_assign_async(lhs, rhs, BitOpType::And, stream); + self.unchecked_bitop_assign_async(lhs, rhs, BitOpType::And, streams); } pub fn bitand_assign( &self, ct_left: &mut T, ct_right: &T, - stream: &CudaStreams, + streams: &CudaStreams, ) { unsafe { - self.bitand_assign_async(ct_left, ct_right, stream); + self.bitand_assign_async(ct_left, ct_right, streams); } - stream.synchronize(); + streams.synchronize(); } /// Computes 
homomorphically bitor between two ciphertexts encrypting integer values. @@ -507,11 +514,11 @@ impl CudaServerKey { /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS; /// /// let gpu_index = 0; - /// let mut stream = CudaStreams::new_single_gpu(gpu_index); + /// let mut streams = CudaStreams::new_single_gpu(gpu_index); /// /// // We have 4 * 2 = 8 bits of message /// let size = 4; - /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream); + /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams); /// /// let msg1 = 201u64; /// let msg2 = 1u64; @@ -520,14 +527,14 @@ impl CudaServerKey { /// let ct2 = cks.encrypt(msg2); /// /// // Copy to GPU - /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut stream); - /// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut stream); + /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut streams); + /// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut streams); /// /// // Compute homomorphically a bitwise and: - /// let d_ct_res = sks.bitor(&d_ct1, &d_ct2, &mut stream); + /// let d_ct_res = sks.bitor(&d_ct1, &d_ct2, &mut streams); /// /// // Copy back to CPU - /// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream); + /// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams); /// /// // Decrypt: /// let dec: u64 = cks.decrypt(&ct_res); @@ -537,22 +544,22 @@ impl CudaServerKey { &self, ct_left: &T, ct_right: &T, - stream: &CudaStreams, + streams: &CudaStreams, ) -> T { - let mut result = unsafe { ct_left.duplicate_async(stream) }; - self.bitor_assign(&mut result, ct_right, stream); + let mut result = unsafe { ct_left.duplicate_async(streams) }; + self.bitor_assign(&mut result, ct_right, streams); result } /// # Safety /// - /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must - /// not be dropped until stream is synchronised + /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must + /// not be dropped until streams is synchronised pub unsafe fn bitor_assign_async( &self, ct_left: &mut T, ct_right: &T, - stream: &CudaStreams, + streams: &CudaStreams, ) { let mut tmp_rhs; @@ -562,36 +569,36 @@ impl CudaServerKey { ) { (true, true) => (ct_left, ct_right), (true, false) => { - tmp_rhs = ct_right.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_rhs, stream); + tmp_rhs = ct_right.duplicate_async(streams); + self.full_propagate_assign_async(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { - self.full_propagate_assign_async(ct_left, stream); + self.full_propagate_assign_async(ct_left, streams); (ct_left, ct_right) } (false, false) => { - tmp_rhs = ct_right.duplicate_async(stream); + tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(ct_left, stream); - self.full_propagate_assign_async(&mut tmp_rhs, stream); + self.full_propagate_assign_async(ct_left, streams); + self.full_propagate_assign_async(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } }; - self.unchecked_bitop_assign_async(lhs, rhs, BitOpType::Or, stream); + self.unchecked_bitop_assign_async(lhs, rhs, BitOpType::Or, streams); } pub fn bitor_assign( &self, ct_left: &mut T, ct_right: &T, - stream: &CudaStreams, + streams: &CudaStreams, ) { unsafe { - self.bitor_assign_async(ct_left, ct_right, stream); + self.bitor_assign_async(ct_left, ct_right, streams); } - 
stream.synchronize(); + streams.synchronize(); } /// Computes homomorphically bitxor between two ciphertexts encrypting integer values. @@ -611,11 +618,11 @@ impl CudaServerKey { /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS; /// /// let gpu_index = 0; - /// let mut stream = CudaStreams::new_single_gpu(gpu_index); + /// let mut streams = CudaStreams::new_single_gpu(gpu_index); /// /// // We have 4 * 2 = 8 bits of message /// let size = 4; - /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream); + /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams); /// /// let msg1 = 201u64; /// let msg2 = 1u64; @@ -624,14 +631,14 @@ impl CudaServerKey { /// let ct2 = cks.encrypt(msg2); /// /// // Copy to GPU - /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut stream); - /// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut stream); + /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut streams); + /// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut streams); /// /// // Compute homomorphically a bitwise and: - /// let d_ct_res = sks.bitxor(&d_ct1, &d_ct2, &mut stream); + /// let d_ct_res = sks.bitxor(&d_ct1, &d_ct2, &mut streams); /// /// // Copy back to CPU - /// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream); + /// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams); /// /// // Decrypt: /// let dec: u64 = cks.decrypt(&ct_res); @@ -641,22 +648,22 @@ impl CudaServerKey { &self, ct_left: &T, ct_right: &T, - stream: &CudaStreams, + streams: &CudaStreams, ) -> T { - let mut result = unsafe { ct_left.duplicate_async(stream) }; - self.bitxor_assign(&mut result, ct_right, stream); + let mut result = unsafe { ct_left.duplicate_async(streams) }; + self.bitxor_assign(&mut result, ct_right, streams); result } /// # Safety /// - /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must - /// not be dropped until stream is synchronised + /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must + /// not be dropped until streams is synchronised pub unsafe fn bitxor_assign_async( &self, ct_left: &mut T, ct_right: &T, - stream: &CudaStreams, + streams: &CudaStreams, ) { let mut tmp_rhs; @@ -666,36 +673,36 @@ impl CudaServerKey { ) { (true, true) => (ct_left, ct_right), (true, false) => { - tmp_rhs = ct_right.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_rhs, stream); + tmp_rhs = ct_right.duplicate_async(streams); + self.full_propagate_assign_async(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { - self.full_propagate_assign_async(ct_left, stream); + self.full_propagate_assign_async(ct_left, streams); (ct_left, ct_right) } (false, false) => { - tmp_rhs = ct_right.duplicate_async(stream); + tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(ct_left, stream); - self.full_propagate_assign_async(&mut tmp_rhs, stream); + self.full_propagate_assign_async(ct_left, streams); + self.full_propagate_assign_async(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } }; - self.unchecked_bitop_assign_async(lhs, rhs, BitOpType::Xor, stream); + self.unchecked_bitop_assign_async(lhs, rhs, BitOpType::Xor, streams); } pub fn bitxor_assign( &self, ct_left: &mut T, ct_right: &T, - stream: &CudaStreams, + streams: &CudaStreams, ) { unsafe { - self.bitxor_assign_async(ct_left, ct_right, 
stream); + self.bitxor_assign_async(ct_left, ct_right, streams); } - stream.synchronize(); + streams.synchronize(); } /// Computes homomorphically bitnot for an encrypted integer value. @@ -716,55 +723,55 @@ impl CudaServerKey { /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS; /// /// let gpu_index = 0; - /// let mut stream = CudaStreams::new_single_gpu(gpu_index); + /// let mut streams = CudaStreams::new_single_gpu(gpu_index); /// /// // We have 4 * 2 = 8 bits of message /// let size = 4; - /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream); + /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams); /// /// let msg = 1u64; /// /// let ct = cks.encrypt(msg); /// /// // Copy to GPU - /// let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &mut stream); + /// let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &mut streams); /// /// // Compute homomorphically a bitwise and: - /// let d_ct_res = sks.bitnot(&d_ct, &mut stream); + /// let d_ct_res = sks.bitnot(&d_ct, &mut streams); /// /// // Copy back to CPU - /// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream); + /// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams); /// /// // Decrypt: /// let dec: u64 = cks.decrypt(&ct_res); /// assert_eq!(dec, !msg % 256); /// ``` - pub fn bitnot(&self, ct: &T, stream: &CudaStreams) -> T { - let mut result = unsafe { ct.duplicate_async(stream) }; - self.bitnot_assign(&mut result, stream); + pub fn bitnot(&self, ct: &T, streams: &CudaStreams) -> T { + let mut result = unsafe { ct.duplicate_async(streams) }; + self.bitnot_assign(&mut result, streams); result } /// # Safety /// - /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must - /// not be dropped until stream is synchronised + /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must + /// not be dropped until streams is synchronised pub unsafe fn bitnot_assign_async( &self, ct: &mut T, - stream: &CudaStreams, + streams: &CudaStreams, ) { if !ct.block_carries_are_empty() { - self.full_propagate_assign_async(ct, stream); + self.full_propagate_assign_async(ct, streams); } - self.unchecked_bitnot_assign_async(ct, stream); + self.unchecked_bitnot_assign_async(ct, streams); } - pub fn bitnot_assign(&self, ct: &mut T, stream: &CudaStreams) { + pub fn bitnot_assign(&self, ct: &mut T, streams: &CudaStreams) { unsafe { - self.bitnot_assign_async(ct, stream); + self.bitnot_assign_async(ct, streams); } - stream.synchronize(); + streams.synchronize(); } } diff --git a/tfhe/src/integer/gpu/server_key/radix/mod.rs b/tfhe/src/integer/gpu/server_key/radix/mod.rs index fcd1f52848..bf7c84b705 100644 --- a/tfhe/src/integer/gpu/server_key/radix/mod.rs +++ b/tfhe/src/integer/gpu/server_key/radix/mod.rs @@ -327,25 +327,26 @@ impl CudaServerKey { &self, ct: &T, num_blocks: usize, - stream: &CudaStreams, + streams: &CudaStreams, ) -> T { let new_num_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 + num_blocks; let ciphertext_modulus = ct.as_ref().d_blocks.ciphertext_modulus(); let lwe_size = ct.as_ref().d_blocks.lwe_dimension().to_lwe_size(); let shift = num_blocks * lwe_size.0; - let mut extended_ct_vec = - unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, stream, 0) }; + let mut extended_ct_vec = unsafe { + CudaVec::new_async(new_num_blocks * lwe_size.0, streams, streams.gpu_indexes[0]) + }; unsafe { - extended_ct_vec.memset_async(0u64, stream, 
0); + extended_ct_vec.memset_async(0u64, streams, streams.gpu_indexes[0]); extended_ct_vec.copy_self_range_gpu_to_gpu_async( shift.., &ct.as_ref().d_blocks.0.d_vec, - stream, + streams, 0, ); } - stream.synchronize(); + streams.synchronize(); let extended_ct_list = CudaLweCiphertextList::from_cuda_vec( extended_ct_vec, LweCiphertextCount(new_num_blocks), @@ -398,19 +399,24 @@ impl CudaServerKey { &self, ct: &T, num_blocks: usize, - stream: &CudaStreams, + streams: &CudaStreams, ) -> T { let new_num_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 + num_blocks; let ciphertext_modulus = ct.as_ref().d_blocks.ciphertext_modulus(); let lwe_size = ct.as_ref().d_blocks.lwe_dimension().to_lwe_size(); - let mut extended_ct_vec = - unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, stream, 0) }; + let mut extended_ct_vec = unsafe { + CudaVec::new_async(new_num_blocks * lwe_size.0, streams, streams.gpu_indexes[0]) + }; unsafe { - extended_ct_vec.memset_async(0u64, stream, 0); - extended_ct_vec.copy_from_gpu_async(&ct.as_ref().d_blocks.0.d_vec, stream, 0); + extended_ct_vec.memset_async(0u64, streams, streams.gpu_indexes[0]); + extended_ct_vec.copy_from_gpu_async( + &ct.as_ref().d_blocks.0.d_vec, + streams, + streams.gpu_indexes[0], + ); } - stream.synchronize(); + streams.synchronize(); let extended_ct_list = CudaLweCiphertextList::from_cuda_vec( extended_ct_vec, LweCiphertextCount(new_num_blocks), @@ -463,24 +469,25 @@ impl CudaServerKey { &self, ct: &T, num_blocks: usize, - stream: &CudaStreams, + streams: &CudaStreams, ) -> T { let new_num_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 - num_blocks; let ciphertext_modulus = ct.as_ref().d_blocks.ciphertext_modulus(); let lwe_size = ct.as_ref().d_blocks.lwe_dimension().to_lwe_size(); let shift = num_blocks * lwe_size.0; - let mut trimmed_ct_vec = - unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, stream, 0) }; + let mut trimmed_ct_vec = unsafe { + CudaVec::new_async(new_num_blocks * lwe_size.0, streams, streams.gpu_indexes[0]) + }; unsafe { trimmed_ct_vec.copy_src_range_gpu_to_gpu_async( shift.., &ct.as_ref().d_blocks.0.d_vec, - stream, + streams, 0, ); } - stream.synchronize(); + streams.synchronize(); let trimmed_ct_list = CudaLweCiphertextList::from_cuda_vec( trimmed_ct_vec, LweCiphertextCount(new_num_blocks), @@ -530,24 +537,25 @@ impl CudaServerKey { &self, ct: &T, num_blocks: usize, - stream: &CudaStreams, + streams: &CudaStreams, ) -> T { let new_num_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 - num_blocks; let ciphertext_modulus = ct.as_ref().d_blocks.ciphertext_modulus(); let lwe_size = ct.as_ref().d_blocks.lwe_dimension().to_lwe_size(); let shift = new_num_blocks * lwe_size.0; - let mut trimmed_ct_vec = - unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, stream, 0) }; + let mut trimmed_ct_vec = unsafe { + CudaVec::new_async(new_num_blocks * lwe_size.0, streams, streams.gpu_indexes[0]) + }; unsafe { trimmed_ct_vec.copy_src_range_gpu_to_gpu_async( 0..shift, &ct.as_ref().d_blocks.0.d_vec, - stream, + streams, 0, ); } - stream.synchronize(); + streams.synchronize(); let trimmed_ct_list = CudaLweCiphertextList::from_cuda_vec( trimmed_ct_vec, LweCiphertextCount(new_num_blocks), @@ -594,7 +602,7 @@ impl CudaServerKey { &self, ct: &T, num_blocks: usize, - stream: &CudaStreams, + streams: &CudaStreams, ) -> T { let message_modulus = self.message_modulus.0 as u64; let num_bits_in_block = message_modulus.ilog2(); @@ -612,28 +620,40 @@ impl CudaServerKey { let lwe_size = 
ct.as_ref().d_blocks.0.lwe_dimension.to_lwe_size().0; // Allocate the necessary amount of memory - let mut output_radix = CudaVec::new(new_num_ct_blocks * lwe_size, stream, 0); + let mut output_radix = CudaVec::new( + new_num_ct_blocks * lwe_size, + streams, + streams.gpu_indexes[0], + ); unsafe { - output_radix.copy_from_gpu_async(&ct.as_ref().d_blocks.0.d_vec, stream, 0); + output_radix.copy_from_gpu_async( + &ct.as_ref().d_blocks.0.d_vec, + streams, + streams.gpu_indexes[0], + ); // Get the last ct block let last_block = ct .as_ref() .d_blocks .0 .d_vec - .as_slice(lwe_size * (num_ct_blocks - 1).., 0) + .as_slice(lwe_size * (num_ct_blocks - 1).., streams.gpu_indexes[0]) .unwrap(); let mut output_slice = output_radix - .as_mut_slice(lwe_size * num_ct_blocks..lwe_size * new_num_ct_blocks, 0) + .as_mut_slice( + lwe_size * num_ct_blocks..lwe_size * new_num_ct_blocks, + streams.gpu_indexes[0], + ) .unwrap(); - let (padding_block, new_blocks) = output_slice.split_at_mut(lwe_size, 0); + let (padding_block, new_blocks) = + output_slice.split_at_mut(lwe_size, streams.gpu_indexes[0]); let mut padding_block = padding_block.unwrap(); let mut new_blocks = new_blocks.unwrap(); match &self.bootstrapping_key { CudaBootstrappingKey::Classic(d_bsk) => { apply_univariate_lut_kb_async( - stream, + streams, &mut padding_block, &last_block, padding_block_creator_lut.acc.as_ref(), @@ -657,7 +677,7 @@ impl CudaServerKey { } CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { apply_univariate_lut_kb_async( - stream, + streams, &mut padding_block, &last_block, padding_block_creator_lut.acc.as_ref(), @@ -682,12 +702,12 @@ impl CudaServerKey { } for i in 0..num_blocks - 1 { let mut output_block = new_blocks - .get_mut(lwe_size * i..lwe_size * (i + 1), 0) + .get_mut(lwe_size * i..lwe_size * (i + 1), streams.gpu_indexes[0]) .unwrap(); - output_block.copy_from_gpu_async(&padding_block, stream, 0); + output_block.copy_from_gpu_async(&padding_block, streams, streams.gpu_indexes[0]); } } - stream.synchronize(); + streams.synchronize(); let output_lwe_list = CudaLweCiphertextList(CudaLweList { d_vec: output_radix, lwe_ciphertext_count: LweCiphertextCount(new_num_ct_blocks), diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs index 4211f860da..9afa8ba40c 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs @@ -11,14 +11,14 @@ use crate::integer::gpu::{ impl CudaServerKey { /// # Safety /// - /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must - /// not be dropped until stream is synchronised + /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must + /// not be dropped until streams is synchronised pub unsafe fn unchecked_scalar_bitop_assign_async( &self, ct: &mut T, rhs: Scalar, op: BitOpType, - stream: &CudaStreams, + streams: &CudaStreams, ) where Scalar: DecomposableInto, T: CudaIntegerRadixCiphertext, @@ -31,12 +31,13 @@ impl CudaServerKey { .map(|x| x as u64) .collect::>(); - let clear_blocks = CudaVec::from_cpu_async(&h_clear_blocks, stream, 0); + let clear_blocks = + CudaVec::from_cpu_async(&h_clear_blocks, streams, streams.gpu_indexes[0]); match &self.bootstrapping_key { CudaBootstrappingKey::Classic(d_bsk) => { unchecked_scalar_bitop_integer_radix_kb_assign_async( - stream, + streams, &mut ct.as_mut().d_blocks.0.d_vec, &clear_blocks, &d_bsk.d_vec, @@ -63,7 +64,7 @@ 
impl CudaServerKey { } CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { unchecked_scalar_bitop_integer_radix_kb_assign_async( - stream, + streams, &mut ct.as_mut().d_blocks.0.d_vec, &clear_blocks, &d_multibit_bsk.d_vec, @@ -91,13 +92,18 @@ impl CudaServerKey { } } - pub fn unchecked_scalar_bitand(&self, ct: &T, rhs: Scalar, stream: &CudaStreams) -> T + pub fn unchecked_scalar_bitand( + &self, + ct: &T, + rhs: Scalar, + streams: &CudaStreams, + ) -> T where Scalar: DecomposableInto, T: CudaIntegerRadixCiphertext, { - let mut result = unsafe { ct.duplicate_async(stream) }; - self.unchecked_scalar_bitand_assign(&mut result, rhs, stream); + let mut result = unsafe { ct.duplicate_async(streams) }; + self.unchecked_scalar_bitand_assign(&mut result, rhs, streams); result } @@ -105,25 +111,25 @@ impl CudaServerKey { &self, ct: &mut T, rhs: Scalar, - stream: &CudaStreams, + streams: &CudaStreams, ) where Scalar: DecomposableInto, T: CudaIntegerRadixCiphertext, { unsafe { - self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarAnd, stream); + self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarAnd, streams); ct.as_mut().info = ct.as_ref().info.after_scalar_bitand(rhs); } - stream.synchronize(); + streams.synchronize(); } - pub fn unchecked_scalar_bitor(&self, ct: &T, rhs: Scalar, stream: &CudaStreams) -> T + pub fn unchecked_scalar_bitor(&self, ct: &T, rhs: Scalar, streams: &CudaStreams) -> T where Scalar: DecomposableInto, T: CudaIntegerRadixCiphertext, { - let mut result = unsafe { ct.duplicate_async(stream) }; - self.unchecked_scalar_bitor_assign(&mut result, rhs, stream); + let mut result = unsafe { ct.duplicate_async(streams) }; + self.unchecked_scalar_bitor_assign(&mut result, rhs, streams); result } @@ -131,25 +137,30 @@ impl CudaServerKey { &self, ct: &mut T, rhs: Scalar, - stream: &CudaStreams, + streams: &CudaStreams, ) where Scalar: DecomposableInto, T: CudaIntegerRadixCiphertext, { unsafe { - self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarOr, stream); + self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarOr, streams); ct.as_mut().info = ct.as_ref().info.after_scalar_bitor(rhs); } - stream.synchronize(); + streams.synchronize(); } - pub fn unchecked_scalar_bitxor(&self, ct: &T, rhs: Scalar, stream: &CudaStreams) -> T + pub fn unchecked_scalar_bitxor( + &self, + ct: &T, + rhs: Scalar, + streams: &CudaStreams, + ) -> T where Scalar: DecomposableInto, T: CudaIntegerRadixCiphertext, { - let mut result = unsafe { ct.duplicate_async(stream) }; - self.unchecked_scalar_bitxor_assign(&mut result, rhs, stream); + let mut result = unsafe { ct.duplicate_async(streams) }; + self.unchecked_scalar_bitxor_assign(&mut result, rhs, streams); result } @@ -157,138 +168,138 @@ impl CudaServerKey { &self, ct: &mut T, rhs: Scalar, - stream: &CudaStreams, + streams: &CudaStreams, ) where Scalar: DecomposableInto, T: CudaIntegerRadixCiphertext, { unsafe { - self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarXor, stream); + self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarXor, streams); ct.as_mut().info = ct.as_ref().info.after_scalar_bitxor(rhs); } - stream.synchronize(); + streams.synchronize(); } /// # Safety /// - /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must - /// not be dropped until stream is synchronised + /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must + /// not be dropped until streams is synchronised pub 
unsafe fn scalar_bitand_assign_async( &self, ct: &mut T, rhs: Scalar, - stream: &CudaStreams, + streams: &CudaStreams, ) where Scalar: DecomposableInto, T: CudaIntegerRadixCiphertext, { if !ct.block_carries_are_empty() { - self.full_propagate_assign_async(ct, stream); + self.full_propagate_assign_async(ct, streams); } - self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarAnd, stream); + self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarAnd, streams); ct.as_mut().info = ct.as_ref().info.after_scalar_bitand(rhs); } - pub fn scalar_bitand_assign(&self, ct: &mut T, rhs: Scalar, stream: &CudaStreams) + pub fn scalar_bitand_assign(&self, ct: &mut T, rhs: Scalar, streams: &CudaStreams) where Scalar: DecomposableInto, T: CudaIntegerRadixCiphertext, { unsafe { - self.scalar_bitand_assign_async(ct, rhs, stream); + self.scalar_bitand_assign_async(ct, rhs, streams); } - stream.synchronize(); + streams.synchronize(); } - pub fn scalar_bitand(&self, ct: &T, rhs: Scalar, stream: &CudaStreams) -> T + pub fn scalar_bitand(&self, ct: &T, rhs: Scalar, streams: &CudaStreams) -> T where Scalar: DecomposableInto, T: CudaIntegerRadixCiphertext, { - let mut result = unsafe { ct.duplicate_async(stream) }; - self.scalar_bitand_assign(&mut result, rhs, stream); + let mut result = unsafe { ct.duplicate_async(streams) }; + self.scalar_bitand_assign(&mut result, rhs, streams); result } /// # Safety /// - /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must - /// not be dropped until stream is synchronised + /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must + /// not be dropped until streams is synchronised pub unsafe fn scalar_bitor_assign_async( &self, ct: &mut T, rhs: Scalar, - stream: &CudaStreams, + streams: &CudaStreams, ) where Scalar: DecomposableInto, T: CudaIntegerRadixCiphertext, { if !ct.block_carries_are_empty() { - self.full_propagate_assign_async(ct, stream); + self.full_propagate_assign_async(ct, streams); } - self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarOr, stream); + self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarOr, streams); ct.as_mut().info = ct.as_ref().info.after_scalar_bitor(rhs); } - pub fn scalar_bitor_assign(&self, ct: &mut T, rhs: Scalar, stream: &CudaStreams) + pub fn scalar_bitor_assign(&self, ct: &mut T, rhs: Scalar, streams: &CudaStreams) where Scalar: DecomposableInto, T: CudaIntegerRadixCiphertext, { unsafe { - self.scalar_bitor_assign_async(ct, rhs, stream); + self.scalar_bitor_assign_async(ct, rhs, streams); } - stream.synchronize(); + streams.synchronize(); } - pub fn scalar_bitor(&self, ct: &T, rhs: Scalar, stream: &CudaStreams) -> T + pub fn scalar_bitor(&self, ct: &T, rhs: Scalar, streams: &CudaStreams) -> T where Scalar: DecomposableInto, T: CudaIntegerRadixCiphertext, { - let mut result = unsafe { ct.duplicate_async(stream) }; - self.scalar_bitor_assign(&mut result, rhs, stream); + let mut result = unsafe { ct.duplicate_async(streams) }; + self.scalar_bitor_assign(&mut result, rhs, streams); result } /// # Safety /// - /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must - /// not be dropped until stream is synchronised + /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must + /// not be dropped until streams is synchronised pub unsafe fn scalar_bitxor_assign_async( &self, ct: &mut T, rhs: Scalar, - stream: &CudaStreams, + streams: 
&CudaStreams, ) where Scalar: DecomposableInto, T: CudaIntegerRadixCiphertext, { if !ct.block_carries_are_empty() { - self.full_propagate_assign_async(ct, stream); + self.full_propagate_assign_async(ct, streams); } - self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarXor, stream); + self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarXor, streams); ct.as_mut().info = ct.as_ref().info.after_scalar_bitxor(rhs); } - pub fn scalar_bitxor_assign(&self, ct: &mut T, rhs: Scalar, stream: &CudaStreams) + pub fn scalar_bitxor_assign(&self, ct: &mut T, rhs: Scalar, streams: &CudaStreams) where Scalar: DecomposableInto, T: CudaIntegerRadixCiphertext, { unsafe { - self.scalar_bitxor_assign_async(ct, rhs, stream); + self.scalar_bitxor_assign_async(ct, rhs, streams); } - stream.synchronize(); + streams.synchronize(); } - pub fn scalar_bitxor(&self, ct: &T, rhs: Scalar, stream: &CudaStreams) -> T + pub fn scalar_bitxor(&self, ct: &T, rhs: Scalar, streams: &CudaStreams) -> T where Scalar: DecomposableInto, T: CudaIntegerRadixCiphertext, { - let mut result = unsafe { ct.duplicate_async(stream) }; - self.scalar_bitxor_assign(&mut result, rhs, stream); + let mut result = unsafe { ct.duplicate_async(streams) }; + self.scalar_bitxor_assign(&mut result, rhs, streams); result } } diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_comparison.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_comparison.rs index e3cc0a5c7f..5b6639082f 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_comparison.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_comparison.rs @@ -102,15 +102,15 @@ impl CudaServerKey { /// # Safety /// - /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must - /// not be dropped until stream is synchronised + /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must + /// not be dropped until streams is synchronised pub unsafe fn unchecked_signed_and_unsigned_scalar_comparison_async( &self, ct: &T, scalar: Scalar, op: ComparisonType, signed_with_positive_scalar: bool, - stream: &CudaStreams, + streams: &CudaStreams, ) -> CudaBooleanBlock where Scalar: DecomposableInto, @@ -122,7 +122,7 @@ impl CudaServerKey { ComparisonType::GT | ComparisonType::GE | ComparisonType::NE => 1, _ => 0, }; - let ct_res: T = self.create_trivial_radix(value, 1, stream); + let ct_res: T = self.create_trivial_radix(value, 1, streams); return CudaBooleanBlock::from_cuda_radix_ciphertext(ct_res.into_inner()); } @@ -144,7 +144,7 @@ impl CudaServerKey { ComparisonType::LT | ComparisonType::LE | ComparisonType::NE => 1, _ => 0, }; - let ct_res: T = self.create_trivial_radix(value, 1, stream); + let ct_res: T = self.create_trivial_radix(value, 1, streams); return CudaBooleanBlock::from_cuda_radix_ciphertext(ct_res.into_inner()); } @@ -153,7 +153,8 @@ impl CudaServerKey { // as we will handle them separately. 
scalar_blocks.truncate(ct.as_ref().d_blocks.lwe_ciphertext_count().0); - let d_scalar_blocks: CudaVec = CudaVec::from_cpu_async(&scalar_blocks, stream, 0); + let d_scalar_blocks: CudaVec = + CudaVec::from_cpu_async(&scalar_blocks, streams, streams.gpu_indexes[0]); let lwe_ciphertext_count = ct.as_ref().d_blocks.lwe_ciphertext_count(); @@ -161,7 +162,7 @@ impl CudaServerKey { ct.as_ref().d_blocks.lwe_dimension(), LweCiphertextCount(1), CiphertextModulus::new_native(), - stream, + streams, ); let mut block_info = ct.as_ref().info.blocks[0]; block_info.degree = Degree::new(0); @@ -174,7 +175,7 @@ impl CudaServerKey { match &self.bootstrapping_key { CudaBootstrappingKey::Classic(d_bsk) => { unchecked_scalar_comparison_integer_radix_kb_async( - stream, + streams, &mut result.as_mut().ciphertext.d_blocks.0.d_vec, &ct.as_ref().d_blocks.0.d_vec, &d_scalar_blocks, @@ -204,7 +205,7 @@ impl CudaServerKey { } CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { unchecked_scalar_comparison_integer_radix_kb_async( - stream, + streams, &mut result.as_mut().ciphertext.d_blocks.0.d_vec, &ct.as_ref().d_blocks.0.d_vec, &d_scalar_blocks, @@ -239,14 +240,14 @@ impl CudaServerKey { /// # Safety /// - /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must - /// not be dropped until stream is synchronised + /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must + /// not be dropped until streams is synchronised pub unsafe fn unchecked_scalar_comparison_async( &self, ct: &T, scalar: Scalar, op: ComparisonType, - stream: &CudaStreams, + streams: &CudaStreams, ) -> CudaBooleanBlock where Scalar: DecomposableInto, @@ -260,12 +261,12 @@ impl CudaServerKey { // Scalar is greater than the bounds, so ciphertext is smaller let result: T = match op { ComparisonType::LT | ComparisonType::LE => { - self.create_trivial_radix(1, num_blocks, stream) + self.create_trivial_radix(1, num_blocks, streams) } _ => self.create_trivial_radix( 0, ct.as_ref().d_blocks.lwe_ciphertext_count().0, - stream, + streams, ), }; return CudaBooleanBlock::from_cuda_radix_ciphertext(result.into_inner()); @@ -274,12 +275,12 @@ impl CudaServerKey { // Scalar is smaller than the bounds, so ciphertext is bigger let result: T = match op { ComparisonType::GT | ComparisonType::GE => { - self.create_trivial_radix(1, num_blocks, stream) + self.create_trivial_radix(1, num_blocks, streams) } _ => self.create_trivial_radix( 0, ct.as_ref().d_blocks.lwe_ciphertext_count().0, - stream, + streams, ), }; return CudaBooleanBlock::from_cuda_radix_ciphertext(result.into_inner()); @@ -292,29 +293,29 @@ impl CudaServerKey { if scalar >= Scalar::ZERO { self.unchecked_signed_and_unsigned_scalar_comparison_async( - ct, scalar, op, true, stream, + ct, scalar, op, true, streams, ) } else { - let scalar_as_trivial = self.create_trivial_radix(scalar, num_blocks, stream); - self.unchecked_comparison_async(ct, &scalar_as_trivial, op, stream) + let scalar_as_trivial = self.create_trivial_radix(scalar, num_blocks, streams); + self.unchecked_comparison_async(ct, &scalar_as_trivial, op, streams) } } else { // Unsigned self.unchecked_signed_and_unsigned_scalar_comparison_async( - ct, scalar, op, false, stream, + ct, scalar, op, false, streams, ) } } /// # Safety /// - /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must - /// not be dropped until stream is synchronised + /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must 
+ /// not be dropped until streams is synchronised pub unsafe fn unchecked_scalar_minmax_async( &self, ct: &T, scalar: Scalar, op: ComparisonType, - stream: &CudaStreams, + streams: &CudaStreams, ) -> T where T: CudaIntegerRadixCiphertext, @@ -327,16 +328,17 @@ impl CudaServerKey { .iter_as::() .collect::>(); - let d_scalar_blocks: CudaVec = CudaVec::from_cpu_async(&scalar_blocks, stream, 0); + let d_scalar_blocks: CudaVec = + CudaVec::from_cpu_async(&scalar_blocks, streams, streams.gpu_indexes[0]); let lwe_ciphertext_count = ct.as_ref().d_blocks.lwe_ciphertext_count(); - let mut result = ct.duplicate_async(stream); + let mut result = ct.duplicate_async(streams); match &self.bootstrapping_key { CudaBootstrappingKey::Classic(d_bsk) => { unchecked_scalar_comparison_integer_radix_kb_async( - stream, + streams, &mut result.as_mut().d_blocks.0.d_vec, &ct.as_ref().d_blocks.0.d_vec, &d_scalar_blocks, @@ -366,7 +368,7 @@ impl CudaServerKey { } CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { unchecked_scalar_comparison_integer_radix_kb_async( - stream, + streams, &mut result.as_mut().d_blocks.0.d_vec, &ct.as_ref().d_blocks.0.d_vec, &d_scalar_blocks, @@ -401,45 +403,45 @@ impl CudaServerKey { /// # Safety /// - /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must - /// not be dropped until stream is synchronised + /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must + /// not be dropped until streams is synchronised pub unsafe fn unchecked_scalar_eq_async( &self, ct: &T, scalar: Scalar, - stream: &CudaStreams, + streams: &CudaStreams, ) -> CudaBooleanBlock where T: CudaIntegerRadixCiphertext, Scalar: DecomposableInto, { - self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::EQ, stream) + self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::EQ, streams) } pub fn unchecked_scalar_eq( &self, ct: &T, scalar: Scalar, - stream: &CudaStreams, + streams: &CudaStreams, ) -> CudaBooleanBlock where T: CudaIntegerRadixCiphertext, Scalar: DecomposableInto, { - let result = unsafe { self.unchecked_scalar_eq_async(ct, scalar, stream) }; - stream.synchronize(); + let result = unsafe { self.unchecked_scalar_eq_async(ct, scalar, streams) }; + streams.synchronize(); result } /// # Safety /// - /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must - /// not be dropped until stream is synchronised + /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must + /// not be dropped until streams is synchronised pub unsafe fn scalar_eq_async( &self, ct: &T, scalar: Scalar, - stream: &CudaStreams, + streams: &CudaStreams, ) -> CudaBooleanBlock where T: CudaIntegerRadixCiphertext, @@ -449,12 +451,12 @@ impl CudaServerKey { let lhs = if ct.block_carries_are_empty() { ct } else { - tmp_lhs = ct.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_lhs, stream); + tmp_lhs = ct.duplicate_async(streams); + self.full_propagate_assign_async(&mut tmp_lhs, streams); &tmp_lhs }; - self.unchecked_scalar_eq_async(lhs, scalar, stream) + self.unchecked_scalar_eq_async(lhs, scalar, streams) } /// Compares for equality 2 ciphertexts @@ -473,12 +475,12 @@ impl CudaServerKey { /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS; /// /// let gpu_index = 0; - /// let mut stream = CudaStreams::new_single_gpu(gpu_index); + /// let mut streams = CudaStreams::new_single_gpu(gpu_index); /// /// let size = 4; /// /// // Generate 
the client key and the server key:
-    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &stream);
+    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &streams);
     ///
     /// let msg1 = 14u64;
     /// let msg2 = 97u64;
@@ -486,12 +488,12 @@ impl CudaServerKey {
     /// let ct1 = cks.encrypt(msg1);
     ///
     /// // Copy to GPU
-    /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &stream);
+    /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &streams);
     ///
-    /// let d_ct_res = sks.scalar_eq(&d_ct1, msg2, &stream);
+    /// let d_ct_res = sks.scalar_eq(&d_ct1, msg2, &streams);
     ///
     /// // Copy the result back to CPU
-    /// let ct_res = d_ct_res.to_boolean_block(&stream);
+    /// let ct_res = d_ct_res.to_boolean_block(&streams);
     ///
     /// // Decrypt:
     /// let dec_result = cks.decrypt_bool(&ct_res);
@@ -501,26 +503,26 @@ impl CudaServerKey {
         &self,
         ct: &T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) -> CudaBooleanBlock
     where
         T: CudaIntegerRadixCiphertext,
         Scalar: DecomposableInto,
     {
-        let result = unsafe { self.scalar_eq_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.scalar_eq_async(ct, scalar, streams) };
+        streams.synchronize();
         result
     }
 
     /// # Safety
     ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
     pub unsafe fn scalar_ne_async(
         &self,
         ct: &T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) -> CudaBooleanBlock
     where
         T: CudaIntegerRadixCiphertext,
@@ -530,12 +532,12 @@ impl CudaServerKey {
         let lhs = if ct.block_carries_are_empty() {
             ct
         } else {
-            tmp_lhs = ct.duplicate_async(stream);
-            self.full_propagate_assign_async(&mut tmp_lhs, stream);
+            tmp_lhs = ct.duplicate_async(streams);
+            self.full_propagate_assign_async(&mut tmp_lhs, streams);
             &tmp_lhs
         };
 
-        self.unchecked_scalar_ne_async(lhs, scalar, stream)
+        self.unchecked_scalar_ne_async(lhs, scalar, streams)
     }
 
     /// Compares for equality 2 ciphertexts
@@ -554,12 +556,12 @@ impl CudaServerKey {
     /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
     ///
     /// let gpu_index = 0;
-    /// let mut stream = CudaStreams::new_single_gpu(gpu_index);
+    /// let mut streams = CudaStreams::new_single_gpu(gpu_index);
     ///
     /// let size = 4;
     ///
     /// // Generate the client key and the server key:
-    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &stream);
+    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &streams);
     ///
     /// let msg1 = 14u64;
     /// let msg2 = 97u64;
@@ -567,12 +569,12 @@ impl CudaServerKey {
     /// let ct1 = cks.encrypt(msg1);
     ///
     /// // Copy to GPU
-    /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &stream);
+    /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &streams);
     ///
-    /// let d_ct_res = sks.scalar_ne(&d_ct1, msg2, &stream);
+    /// let d_ct_res = sks.scalar_ne(&d_ct1, msg2, &streams);
     ///
     /// // Copy the result back to CPU
-    /// let ct_res = d_ct_res.to_boolean_block(&stream);
+    /// let ct_res = d_ct_res.to_boolean_block(&streams);
     ///
     /// // Decrypt:
     /// let dec_result = cks.decrypt_bool(&ct_res);
@@ -582,185 +584,185 @@ impl CudaServerKey {
         &self,
         ct: &T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) -> CudaBooleanBlock
     where
         Scalar: DecomposableInto,
         T: CudaIntegerRadixCiphertext,
     {
-        let result = unsafe { self.scalar_ne_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.scalar_ne_async(ct, scalar, streams) };
+        streams.synchronize();
         result
     }
 
     /// # Safety
     ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
     pub unsafe fn unchecked_scalar_ne_async(
         &self,
         ct: &T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) -> CudaBooleanBlock
     where
         T: CudaIntegerRadixCiphertext,
         Scalar: DecomposableInto,
     {
-        self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::NE, stream)
+        self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::NE, streams)
     }
 
     pub fn unchecked_scalar_ne(
         &self,
         ct: &T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) -> CudaBooleanBlock
     where
         T: CudaIntegerRadixCiphertext,
         Scalar: DecomposableInto,
     {
-        let result = unsafe { self.unchecked_scalar_ne_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.unchecked_scalar_ne_async(ct, scalar, streams) };
+        streams.synchronize();
         result
     }
 
     /// # Safety
     ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn unchecked_scalar_gt_async(
         &self,
         ct: &T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) -> CudaBooleanBlock
     where
         Scalar: DecomposableInto,
         T: CudaIntegerRadixCiphertext,
     {
-        self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::GT, stream)
+        self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::GT, streams)
     }
 
     pub fn unchecked_scalar_gt(
         &self,
         ct: &T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) -> CudaBooleanBlock
     where
         Scalar: DecomposableInto,
         T: CudaIntegerRadixCiphertext,
     {
-        let result = unsafe { self.unchecked_scalar_gt_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.unchecked_scalar_gt_async(ct, scalar, streams) };
+        streams.synchronize();
         result
     }
 
     /// # Safety
     ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
     pub unsafe fn unchecked_scalar_ge_async(
         &self,
         ct: &T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) -> CudaBooleanBlock
     where
         Scalar: DecomposableInto,
         T: CudaIntegerRadixCiphertext,
     {
-        self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::GE, stream)
+        self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::GE, streams)
     }
 
     pub fn unchecked_scalar_ge(
         &self,
         ct: &T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) -> CudaBooleanBlock
     where
         Scalar: DecomposableInto,
         T: CudaIntegerRadixCiphertext,
     {
-        let result = unsafe { self.unchecked_scalar_ge_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.unchecked_scalar_ge_async(ct, scalar, streams) };
+        streams.synchronize();
         result
     }
 
     /// # Safety
     ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
     pub unsafe fn unchecked_scalar_lt_async(
         &self,
         ct: &T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) -> CudaBooleanBlock
     where
         Scalar: DecomposableInto,
         T: CudaIntegerRadixCiphertext,
     {
-        self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::LT, stream)
+        self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::LT, streams)
     }
 
     pub fn unchecked_scalar_lt(
         &self,
         ct: &T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) -> CudaBooleanBlock
     where
         Scalar: DecomposableInto,
         T: CudaIntegerRadixCiphertext,
     {
-        let result = unsafe { self.unchecked_scalar_lt_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.unchecked_scalar_lt_async(ct, scalar, streams) };
+        streams.synchronize();
         result
     }
 
     /// # Safety
     ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
     pub unsafe fn unchecked_scalar_le_async(
         &self,
         ct: &T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) -> CudaBooleanBlock
     where
         Scalar: DecomposableInto,
         T: CudaIntegerRadixCiphertext,
     {
-        self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::LE, stream)
+        self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::LE, streams)
     }
 
     pub fn unchecked_scalar_le(
         &self,
         ct: &T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) -> CudaBooleanBlock
     where
         Scalar: DecomposableInto,
         T: CudaIntegerRadixCiphertext,
     {
-        let result = unsafe { self.unchecked_scalar_le_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.unchecked_scalar_le_async(ct, scalar, streams) };
+        streams.synchronize();
         result
     }
 
     /// # Safety
     ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
     pub unsafe fn scalar_gt_async(
         &self,
         ct: &T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) -> CudaBooleanBlock
     where
         Scalar: DecomposableInto,
@@ -770,38 +772,38 @@ impl CudaServerKey {
         let lhs = if ct.block_carries_are_empty() {
             ct
         } else {
-            tmp_lhs = ct.duplicate_async(stream);
-            self.full_propagate_assign_async(&mut tmp_lhs, stream);
+            tmp_lhs = ct.duplicate_async(streams);
+            self.full_propagate_assign_async(&mut tmp_lhs, streams);
             &tmp_lhs
         };
 
-        self.unchecked_scalar_gt_async(lhs, scalar, stream)
+        self.unchecked_scalar_gt_async(lhs, scalar, streams)
     }
 
     pub fn scalar_gt(
         &self,
         ct: &T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) -> CudaBooleanBlock
     where
         Scalar: DecomposableInto,
         T: CudaIntegerRadixCiphertext,
     {
-        let result = unsafe { self.scalar_gt_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.scalar_gt_async(ct, scalar, streams) };
+        streams.synchronize();
         result
     }
 
     /// # Safety
     ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
     pub unsafe fn scalar_ge_async(
         &self,
         ct: &T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) -> CudaBooleanBlock
     where
         Scalar: DecomposableInto,
@@ -811,38 +813,38 @@ impl CudaServerKey {
         let lhs = if ct.block_carries_are_empty() {
             ct
         } else {
-            tmp_lhs = ct.duplicate_async(stream);
-            self.full_propagate_assign_async(&mut tmp_lhs, stream);
+            tmp_lhs = ct.duplicate_async(streams);
+            self.full_propagate_assign_async(&mut tmp_lhs, streams);
             &tmp_lhs
         };
 
-        self.unchecked_scalar_ge_async(lhs, scalar, stream)
+        self.unchecked_scalar_ge_async(lhs, scalar, streams)
     }
 
     pub fn scalar_ge(
         &self,
         ct: &T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) -> CudaBooleanBlock
     where
         Scalar: DecomposableInto,
         T: CudaIntegerRadixCiphertext,
     {
-        let result = unsafe { self.scalar_ge_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.scalar_ge_async(ct, scalar, streams) };
+        streams.synchronize();
         result
     }
 
     /// # Safety
     ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
     pub unsafe fn scalar_lt_async(
         &self,
         ct: &T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) -> CudaBooleanBlock
     where
         Scalar: DecomposableInto,
@@ -852,37 +854,37 @@ impl CudaServerKey {
         let lhs = if ct.block_carries_are_empty() {
             ct
         } else {
-            tmp_lhs = ct.duplicate_async(stream);
-            self.full_propagate_assign_async(&mut tmp_lhs, stream);
+            tmp_lhs = ct.duplicate_async(streams);
+            self.full_propagate_assign_async(&mut tmp_lhs, streams);
             &tmp_lhs
         };
 
-        self.unchecked_scalar_lt_async(lhs, scalar, stream)
+        self.unchecked_scalar_lt_async(lhs, scalar, streams)
     }
 
     pub fn scalar_lt(
         &self,
         ct: &T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) -> CudaBooleanBlock
     where
         Scalar: DecomposableInto,
         T: CudaIntegerRadixCiphertext,
     {
-        let result = unsafe { self.scalar_lt_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.scalar_lt_async(ct, scalar, streams) };
+        streams.synchronize();
         result
     }
 
     /// # Safety
     ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
     pub unsafe fn scalar_le_async(
         &self,
         ct: &T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) -> CudaBooleanBlock
     where
         Scalar: DecomposableInto,
@@ -892,92 +894,102 @@ impl CudaServerKey {
         let lhs = if ct.block_carries_are_empty() {
             ct
         } else {
-            tmp_lhs = ct.duplicate_async(stream);
-            self.full_propagate_assign_async(&mut tmp_lhs, stream);
+            tmp_lhs = ct.duplicate_async(streams);
+            self.full_propagate_assign_async(&mut tmp_lhs, streams);
             &tmp_lhs
         };
 
-        self.unchecked_scalar_le_async(lhs, scalar, stream)
+        self.unchecked_scalar_le_async(lhs, scalar, streams)
     }
 
     pub fn scalar_le(
         &self,
         ct: &T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) -> CudaBooleanBlock
     where
         Scalar: DecomposableInto,
         T: CudaIntegerRadixCiphertext,
     {
-        let result = unsafe { self.scalar_le_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.scalar_le_async(ct, scalar, streams) };
+        streams.synchronize();
         result
     }
 
     /// # Safety
     ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
     pub unsafe fn unchecked_scalar_max_async(
         &self,
         ct: &T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) -> T
     where
         Scalar: DecomposableInto,
         T: CudaIntegerRadixCiphertext,
     {
-        self.unchecked_scalar_minmax_async(ct, scalar, ComparisonType::MAX, stream)
+        self.unchecked_scalar_minmax_async(ct, scalar, ComparisonType::MAX, streams)
     }
 
-    pub fn unchecked_scalar_max(&self, ct: &T, scalar: Scalar, stream: &CudaStreams) -> T
+    pub fn unchecked_scalar_max(
+        &self,
+        ct: &T,
+        scalar: Scalar,
+        streams: &CudaStreams,
+    ) -> T
     where
         Scalar: DecomposableInto,
         T: CudaIntegerRadixCiphertext,
     {
-        let result = unsafe { self.unchecked_scalar_max_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.unchecked_scalar_max_async(ct, scalar, streams) };
+        streams.synchronize();
         result
     }
 
     /// # Safety
     ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
     pub unsafe fn unchecked_scalar_min_async(
         &self,
         ct: &T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) -> T
     where
         Scalar: DecomposableInto,
         T: CudaIntegerRadixCiphertext,
     {
-        self.unchecked_scalar_minmax_async(ct, scalar, ComparisonType::MIN, stream)
+        self.unchecked_scalar_minmax_async(ct, scalar, ComparisonType::MIN, streams)
     }
 
-    pub fn unchecked_scalar_min(&self, ct: &T, scalar: Scalar, stream: &CudaStreams) -> T
+    pub fn unchecked_scalar_min(
+        &self,
+        ct: &T,
+        scalar: Scalar,
+        streams: &CudaStreams,
+    ) -> T
     where
         Scalar: DecomposableInto,
         T: CudaIntegerRadixCiphertext,
     {
-        let result = unsafe { self.unchecked_scalar_min_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.unchecked_scalar_min_async(ct, scalar, streams) };
+        streams.synchronize();
         result
     }
 
     /// # Safety
     ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
     pub unsafe fn scalar_max_async(
         &self,
         ct: &T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) -> T
     where
         Scalar: DecomposableInto,
@@ -987,33 +999,33 @@ impl CudaServerKey {
         let lhs = if ct.block_carries_are_empty() {
             ct
         } else {
-            tmp_lhs = ct.duplicate_async(stream);
-            self.full_propagate_assign_async(&mut tmp_lhs, stream);
+            tmp_lhs = ct.duplicate_async(streams);
+            self.full_propagate_assign_async(&mut tmp_lhs, streams);
             &tmp_lhs
         };
 
-        self.unchecked_scalar_max_async(lhs, scalar, stream)
+        self.unchecked_scalar_max_async(lhs, scalar, streams)
     }
 
-    pub fn scalar_max(&self, ct: &T, scalar: Scalar, stream: &CudaStreams) -> T
+    pub fn scalar_max(&self, ct: &T, scalar: Scalar, streams: &CudaStreams) -> T
     where
         Scalar: DecomposableInto,
         T: CudaIntegerRadixCiphertext,
     {
-        let result = unsafe { self.scalar_max_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.scalar_max_async(ct, scalar, streams) };
+        streams.synchronize();
         result
     }
 
     /// # Safety
     ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
     pub unsafe fn scalar_min_async(
         &self,
         ct: &T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) -> T
     where
         Scalar: DecomposableInto,
@@ -1023,21 +1035,21 @@ impl CudaServerKey {
         let lhs = if ct.block_carries_are_empty() {
             ct
         } else {
-            tmp_lhs = ct.duplicate_async(stream);
-            self.full_propagate_assign_async(&mut tmp_lhs, stream);
+            tmp_lhs = ct.duplicate_async(streams);
+            self.full_propagate_assign_async(&mut tmp_lhs, streams);
             &tmp_lhs
         };
 
-        self.unchecked_scalar_min_async(lhs, scalar, stream)
+        self.unchecked_scalar_min_async(lhs, scalar, streams)
     }
 
-    pub fn scalar_min(&self, ct: &T, scalar: Scalar, stream: &CudaStreams) -> T
+    pub fn scalar_min(&self, ct: &T, scalar: Scalar, streams: &CudaStreams) -> T
     where
         Scalar: DecomposableInto,
         T: CudaIntegerRadixCiphertext,
     {
-        let result = unsafe { self.scalar_min_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.scalar_min_async(ct, scalar, streams) };
+        streams.synchronize();
         result
     }
 }
diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs
index 9bab89c008..8d23090fd9 100644
--- a/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs
@@ -26,50 +26,59 @@ impl CudaServerKey {
     /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
     ///
     /// let gpu_index = 0;
-    /// let mut stream = CudaStreams::new_single_gpu(gpu_index);
+    /// let mut streams = CudaStreams::new_single_gpu(gpu_index);
     ///
     /// // We have 4 * 2 = 8 bits of message
     /// let size = 4;
-    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream);
+    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams);
     ///
     /// let msg = 30;
     /// let scalar = 3;
     ///
     /// let ct = cks.encrypt(msg);
-    /// let mut d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &mut stream);
+    /// let mut d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &mut streams);
     ///
     /// // Compute homomorphically a scalar multiplication:
-    /// let d_ct_res = sks.unchecked_scalar_mul(&d_ct, scalar, &mut stream);
-    /// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream);
+    /// let d_ct_res = sks.unchecked_scalar_mul(&d_ct, scalar, &mut streams);
+    /// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams);
     ///
     /// let clear: u64 = cks.decrypt(&ct_res);
     /// assert_eq!(scalar * msg, clear);
     /// ```
-    pub fn unchecked_scalar_mul(&self, ct: &T, scalar: Scalar, stream: &CudaStreams) -> T
+    pub fn unchecked_scalar_mul(
+        &self,
+        ct: &T,
+        scalar: Scalar,
+        streams: &CudaStreams,
+    ) -> T
     where
         Scalar: ScalarMultiplier + DecomposableInto + CastInto,
         T: CudaIntegerRadixCiphertext,
     {
-        let mut result = unsafe { ct.duplicate_async(stream) };
-        self.unchecked_scalar_mul_assign(&mut result, scalar, stream);
+        let mut result = unsafe { ct.duplicate_async(streams) };
+        self.unchecked_scalar_mul_assign(&mut result, scalar, streams);
         result
     }
 
     /// # Safety
     ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
     pub unsafe fn unchecked_scalar_mul_assign_async(
         &self,
         ct: &mut T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) where
         Scalar: ScalarMultiplier + DecomposableInto + CastInto,
         T: CudaIntegerRadixCiphertext,
     {
         if scalar == Scalar::ZERO {
-            ct.as_mut().d_blocks.0.d_vec.memset_async(0, stream, 0);
+            ct.as_mut()
+                .d_blocks
+                .0
+                .d_vec
+                .memset_async(0, streams, streams.gpu_indexes[0]);
             return;
         }
@@ -80,7 +89,7 @@ impl CudaServerKey {
         if scalar.is_power_of_two() {
             // Shifting cost one bivariate PBS so its always faster
             // than multiplying
-            self.unchecked_scalar_left_shift_assign_async(ct, scalar.ilog2() as u64, stream);
+            self.unchecked_scalar_left_shift_assign_async(ct, scalar.ilog2() as u64, streams);
             return;
         }
         let ciphertext = ct.as_mut();
@@ -104,7 +113,7 @@ impl CudaServerKey {
         match &self.bootstrapping_key {
             CudaBootstrappingKey::Classic(d_bsk) => {
                 unchecked_scalar_mul_integer_radix_kb_async(
-                    stream,
+                    streams,
                     &mut ct.as_mut().d_blocks.0.d_vec,
                     decomposed_scalar.as_slice(),
                     has_at_least_one_set.as_slice(),
@@ -129,7 +138,7 @@ impl CudaServerKey {
             }
             CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
                 unchecked_scalar_mul_integer_radix_kb_async(
-                    stream,
+                    streams,
                     &mut ct.as_mut().d_blocks.0.d_vec,
                     decomposed_scalar.as_slice(),
                     has_at_least_one_set.as_slice(),
@@ -161,15 +170,15 @@ impl CudaServerKey {
         &self,
         ct: &mut T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) where
         Scalar: ScalarMultiplier + DecomposableInto + CastInto,
         T: CudaIntegerRadixCiphertext,
     {
         unsafe {
-            self.unchecked_scalar_mul_assign_async(ct, scalar, stream);
+            self.unchecked_scalar_mul_assign_async(ct, scalar, streams);
         }
-        stream.synchronize();
+        streams.synchronize();
     }
 
     /// Computes homomorphically a multiplication between a scalar and a ciphertext.
@@ -189,63 +198,63 @@ impl CudaServerKey {
     /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
     ///
     /// let gpu_index = 0;
-    /// let mut stream = CudaStreams::new_single_gpu(gpu_index);
+    /// let mut streams = CudaStreams::new_single_gpu(gpu_index);
     ///
     /// // We have 4 * 2 = 8 bits of message
     /// let size = 4;
-    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream);
+    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams);
     ///
     /// let msg = 30;
     /// let scalar = 3;
     ///
     /// let ct = cks.encrypt(msg);
-    /// let mut d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &mut stream);
+    /// let mut d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &mut streams);
     ///
     /// // Compute homomorphically a scalar multiplication:
-    /// let d_ct_res = sks.scalar_mul(&d_ct, scalar, &mut stream);
-    /// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream);
+    /// let d_ct_res = sks.scalar_mul(&d_ct, scalar, &mut streams);
+    /// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams);
     ///
     /// let clear: u64 = cks.decrypt(&ct_res);
     /// assert_eq!(scalar * msg, clear);
     /// ```
-    pub fn scalar_mul(&self, ct: &T, scalar: Scalar, stream: &CudaStreams) -> T
+    pub fn scalar_mul(&self, ct: &T, scalar: Scalar, streams: &CudaStreams) -> T
     where
         Scalar: ScalarMultiplier + DecomposableInto + CastInto,
         T: CudaIntegerRadixCiphertext,
     {
-        let mut result = unsafe { ct.duplicate_async(stream) };
-        self.scalar_mul_assign(&mut result, scalar, stream);
+        let mut result = unsafe { ct.duplicate_async(streams) };
+        self.scalar_mul_assign(&mut result, scalar, streams);
         result
     }
 
     /// # Safety
     ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
     pub unsafe fn scalar_mul_assign_async(
         &self,
         ct: &mut T,
         scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
     ) where
         Scalar: ScalarMultiplier + DecomposableInto + CastInto,
         T: CudaIntegerRadixCiphertext,
     {
         if !ct.block_carries_are_empty() {
-            self.full_propagate_assign_async(ct, stream);
+            self.full_propagate_assign_async(ct, streams);
         };
 
-        self.unchecked_scalar_mul_assign_async(ct, scalar, stream);
+        self.unchecked_scalar_mul_assign_async(ct, scalar, streams);
     }
 
-    pub fn scalar_mul_assign(&self, ct: &mut T, scalar: Scalar, stream: &CudaStreams)
+    pub fn scalar_mul_assign(&self, ct: &mut T, scalar: Scalar, streams: &CudaStreams)
     where
         Scalar: ScalarMultiplier + DecomposableInto + CastInto,
         T: CudaIntegerRadixCiphertext,
     {
         unsafe {
-            self.scalar_mul_assign_async(ct, scalar, stream);
+            self.scalar_mul_assign_async(ct, scalar, streams);
        }
-        stream.synchronize();
+        streams.synchronize();
     }
 }
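For reference, here is a minimal end-to-end sketch of the calling convention this patch settles on, with a single `streams: &CudaStreams` argument threaded through every GPU entry point. It is assembled from the doc-tests above; the import paths and the `gen_keys_radix_gpu` / `CudaUnsignedRadixCiphertext` helpers are assumed to be the same ones those doc-tests use, and the particular combination of a scalar multiplication followed by a scalar comparison is illustrative only, not part of the patch.

```rust
use tfhe::core_crypto::gpu::CudaStreams;
use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
use tfhe::integer::gpu::gen_keys_radix_gpu;
use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;

fn main() {
    // A single-GPU stream set; every GPU server-key method now takes `&CudaStreams`.
    let gpu_index = 0;
    let streams = CudaStreams::new_single_gpu(gpu_index);

    // 4 blocks of 2-bit messages = 8 bits of plaintext space, as in the doc-tests.
    let size = 4;
    let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &streams);

    let msg = 30u64;
    let scalar = 3u64;

    // Encrypt on the CPU, then copy the radix ciphertext to the GPU.
    let ct = cks.encrypt(msg);
    let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &streams);

    // Homomorphic scalar multiplication (carries are propagated first).
    let d_prod = sks.scalar_mul(&d_ct, scalar, &streams);

    // Homomorphic scalar comparison; the result is an encrypted boolean block.
    let d_is_gt = sks.scalar_gt(&d_prod, 50u64, &streams);

    // Copy the results back to the CPU and decrypt.
    let prod: u64 = cks.decrypt(&d_prod.to_radix_ciphertext(&streams));
    let is_gt = cks.decrypt_bool(&d_is_gt.to_boolean_block(&streams));

    assert_eq!(prod, msg * scalar);
    assert!(is_gt); // 90 > 50
}
```

The synchronous wrappers (`scalar_mul`, `scalar_gt`, `scalar_max`, ...) call `streams.synchronize()` internally, so no explicit synchronization is needed in this sketch; only the `*_async` variants require the caller to synchronize before dropping inputs, as their `# Safety` sections state.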