diff --git a/src/evaluate.cpp b/src/evaluate.cpp index f5746ca5199..5cdedc6bcda 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -60,7 +60,7 @@ Value Eval::evaluate(const Eval::NNUE::Networks& networks, int nnueComplexity; int v; - Value nnue = smallNet ? networks.small.evaluate(pos, nullptr, true, &nnueComplexity, psqtOnly) + Value nnue = smallNet ? networks.small.evaluate(pos, &caches.small, true, &nnueComplexity, psqtOnly) : networks.big.evaluate(pos, &caches.big, true, &nnueComplexity, false); const auto adjustEval = [&](int optDiv, int nnueDiv, int npmDiv, int pawnCountConstant, diff --git a/src/nnue/network.cpp b/src/nnue/network.cpp index 656ad97a1e3..8bb11595702 100644 --- a/src/nnue/network.cpp +++ b/src/nnue/network.cpp @@ -259,8 +259,8 @@ void Network::verify(std::string evalfilePath) const { template void Network::hint_common_access(const Position& pos, AccumulatorCaches::Cache* cache, - bool psqtOnl) const { - featureTransformer->hint_common_access(pos, cache, psqtOnl); + bool psqtOnly) const { + featureTransformer->hint_common_access(pos, cache, psqtOnly); } template diff --git a/src/nnue/network.h b/src/nnue/network.h index df59732d955..053b7d19c82 100644 --- a/src/nnue/network.h +++ b/src/nnue/network.h @@ -62,7 +62,7 @@ class Network { void hint_common_access(const Position& pos, AccumulatorCaches::Cache* cache, - bool psqtOnl) const; + bool psqtOnly) const; void verify(std::string evalfilePath) const; NnueEvalTrace trace_evaluate(const Position& pos, diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h index 8d73dbef5ad..709a07a8f46 100644 --- a/src/nnue/nnue_accumulator.h +++ b/src/nnue/nnue_accumulator.h @@ -58,6 +58,7 @@ struct AccumulatorCaches { PSQTWeightType psqtAccumulation[COLOR_NB][PSQTBuckets]; Bitboard byColorBB[COLOR_NB][COLOR_NB]; Bitboard byTypeBB[COLOR_NB][PIECE_TYPE_NB]; + bool psqtOnly; // To initialize a refresh entry, we set all its bitboards empty, // so we put the biases in the accumulation, without any weights on top @@ -65,6 +66,7 @@ struct AccumulatorCaches { std::memset(byColorBB, 0, sizeof(byColorBB)); std::memset(byTypeBB, 0, sizeof(byTypeBB)); + psqtOnly = false; std::memcpy(accumulation[WHITE], biases, Size * sizeof(BiasType)); std::memcpy(accumulation[BLACK], biases, Size * sizeof(BiasType)); @@ -92,11 +94,11 @@ struct AccumulatorCaches { template void clear(const Networks& networks) { big.clear(networks.big); + small.clear(networks.small); } - // When adding a new cache for a network, i.e. the smallnet - // the appropriate condition must be added to FeatureTransformer::update_accumulator_refresh. Cache big; + Cache small; }; } // namespace Stockfish::Eval::NNUE diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 88f0e4031a4..531bd0c7961 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -656,75 +656,97 @@ class FeatureTransformer { template void update_accumulator_refresh_cache(const Position& pos, - AccumulatorCaches::Cache* cache) const { + AccumulatorCaches::Cache* cache, + bool psqtOnly) const { assert(cache != nullptr); Square ksq = pos.square(Perspective); - auto& entry = (*cache)[ksq]; - - auto& accumulator = pos.state()->*accPtr; - accumulator.computed[Perspective] = true; - accumulator.computedPSQT[Perspective] = true; - FeatureSet::IndexList removed, added; - for (Color c : {WHITE, BLACK}) + + if (entry.psqtOnly && !psqtOnly) + { + entry.clear(biases); + FeatureSet::append_active_indices(pos, added); + } + else { - for (PieceType pt = PAWN; pt <= KING; ++pt) + for (Color c : {WHITE, BLACK}) { - const Piece piece = make_piece(c, pt); - const Bitboard oldBB = - entry.byColorBB[Perspective][c] & entry.byTypeBB[Perspective][pt]; - const Bitboard newBB = pos.pieces(c, pt); - Bitboard toRemove = oldBB & ~newBB; - Bitboard toAdd = newBB & ~oldBB; - - while (toRemove) + for (PieceType pt = PAWN; pt <= KING; ++pt) { - Square sq = pop_lsb(toRemove); - removed.push_back(FeatureSet::make_index(sq, piece, ksq)); - } - while (toAdd) - { - Square sq = pop_lsb(toAdd); - added.push_back(FeatureSet::make_index(sq, piece, ksq)); + const Piece piece = make_piece(c, pt); + const Bitboard oldBB = + entry.byColorBB[Perspective][c] & entry.byTypeBB[Perspective][pt]; + const Bitboard newBB = pos.pieces(c, pt); + Bitboard toRemove = oldBB & ~newBB; + Bitboard toAdd = newBB & ~oldBB; + + while (toRemove) + { + Square sq = pop_lsb(toRemove); + removed.push_back(FeatureSet::make_index(sq, piece, ksq)); + } + while (toAdd) + { + Square sq = pop_lsb(toAdd); + added.push_back(FeatureSet::make_index(sq, piece, ksq)); + } } } } + auto& accumulator = pos.state()->*accPtr; + accumulator.computed[Perspective] = !psqtOnly; + accumulator.computedPSQT[Perspective] = true; + #ifdef VECTOR vec_t acc[NumRegs]; psqt_vec_t psqt[NumPsqtRegs]; - for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j) - { - auto entryTile = - reinterpret_cast(&entry.accumulation[Perspective][j * TileHeight]); - for (IndexType k = 0; k < NumRegs; ++k) - acc[k] = entryTile[k]; - - for (int i = 0; i < int(added.size()); ++i) + if (!psqtOnly) + for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j) { - IndexType index = added[i]; - const IndexType offset = HalfDimensions * index + j * TileHeight; - auto column = reinterpret_cast(&weights[offset]); + auto entryTile = + reinterpret_cast(&entry.accumulation[Perspective][j * TileHeight]); + for (IndexType k = 0; k < NumRegs; ++k) + acc[k] = entryTile[k]; - for (unsigned k = 0; k < NumRegs; ++k) - acc[k] = vec_add_16(acc[k], column[k]); - } - for (int i = 0; i < int(removed.size()); ++i) - { - IndexType index = removed[i]; - const IndexType offset = HalfDimensions * index + j * TileHeight; - auto column = reinterpret_cast(&weights[offset]); + int i0 = 0; + for (; i0 < int(std::min(removed.size(), added.size())); ++i0) + { + IndexType indexR = removed[i0]; + const IndexType offsetR = HalfDimensions * indexR + j * TileHeight; + auto columnR = reinterpret_cast(&weights[offsetR]); + IndexType indexA = added[i0]; + const IndexType offsetA = HalfDimensions * indexA + j * TileHeight; + auto columnA = reinterpret_cast(&weights[offsetA]); - for (unsigned k = 0; k < NumRegs; ++k) - acc[k] = vec_sub_16(acc[k], column[k]); - } + for (unsigned k = 0; k < NumRegs; ++k) + acc[k] = vec_add_16(vec_sub_16(acc[k], columnR[k]), columnA[k]); + } + for (int i = i0; i < int(removed.size()); ++i) + { + IndexType index = removed[i]; + const IndexType offset = HalfDimensions * index + j * TileHeight; + auto column = reinterpret_cast(&weights[offset]); - for (IndexType k = 0; k < NumRegs; k++) - vec_store(&entryTile[k], acc[k]); - } + for (unsigned k = 0; k < NumRegs; ++k) + acc[k] = vec_sub_16(acc[k], column[k]); + } + for (int i = i0; i < int(added.size()); ++i) + { + IndexType index = added[i]; + const IndexType offset = HalfDimensions * index + j * TileHeight; + auto column = reinterpret_cast(&weights[offset]); + + for (unsigned k = 0; k < NumRegs; ++k) + acc[k] = vec_add_16(acc[k], column[k]); + } + + for (IndexType k = 0; k < NumRegs; k++) + vec_store(&entryTile[k], acc[k]); + } for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j) { @@ -733,23 +755,23 @@ class FeatureTransformer { for (std::size_t k = 0; k < NumPsqtRegs; ++k) psqt[k] = entryTilePsqt[k]; - for (int i = 0; i < int(added.size()); ++i) + for (int i = 0; i < int(removed.size()); ++i) { - IndexType index = added[i]; + IndexType index = removed[i]; const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight; auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); for (std::size_t k = 0; k < NumPsqtRegs; ++k) - psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]); + psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]); } - for (int i = 0; i < int(removed.size()); ++i) + for (int i = 0; i < int(added.size()); ++i) { - IndexType index = removed[i]; + IndexType index = added[i]; const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight; auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); for (std::size_t k = 0; k < NumPsqtRegs; ++k) - psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]); + psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]); } for (std::size_t k = 0; k < NumPsqtRegs; ++k) @@ -758,23 +780,29 @@ class FeatureTransformer { #else - for (const auto index : added) + for (const auto index : removed) { - const IndexType offset = HalfDimensions * index; - for (IndexType j = 0; j < HalfDimensions; ++j) - entry.accumulation[Perspective][j] += weights[offset + j]; + if (!psqtOnly) + { + const IndexType offset = HalfDimensions * index; + for (IndexType j = 0; j < HalfDimensions; ++j) + entry.accumulation[Perspective][j] -= weights[offset + j]; + } for (std::size_t k = 0; k < PSQTBuckets; ++k) - entry.psqtAccumulation[Perspective][k] += psqtWeights[index * PSQTBuckets + k]; + entry.psqtAccumulation[Perspective][k] -= psqtWeights[index * PSQTBuckets + k]; } - for (const auto index : removed) + for (const auto index : added) { - const IndexType offset = HalfDimensions * index; - for (IndexType j = 0; j < HalfDimensions; ++j) - entry.accumulation[Perspective][j] -= weights[offset + j]; + if (!psqtOnly) + { + const IndexType offset = HalfDimensions * index; + for (IndexType j = 0; j < HalfDimensions; ++j) + entry.accumulation[Perspective][j] += weights[offset + j]; + } for (std::size_t k = 0; k < PSQTBuckets; ++k) - entry.psqtAccumulation[Perspective][k] -= psqtWeights[index * PSQTBuckets + k]; + entry.psqtAccumulation[Perspective][k] += psqtWeights[index * PSQTBuckets + k]; } #endif @@ -782,144 +810,20 @@ class FeatureTransformer { // The accumulator of the refresh entry has been updated. // Now copy its content to the actual accumulator we were refreshing + if (!psqtOnly) + std::memcpy(accumulator.accumulation[Perspective], entry.accumulation[Perspective], + sizeof(BiasType) * HalfDimensions); + std::memcpy(accumulator.psqtAccumulation[Perspective], entry.psqtAccumulation[Perspective], sizeof(int32_t) * PSQTBuckets); - std::memcpy(accumulator.accumulation[Perspective], entry.accumulation[Perspective], - sizeof(BiasType) * HalfDimensions); - for (Color c : {WHITE, BLACK}) entry.byColorBB[Perspective][c] = pos.pieces(c); for (PieceType pt = PAWN; pt <= KING; ++pt) entry.byTypeBB[Perspective][pt] = pos.pieces(pt); - } - - template - void - update_accumulator_refresh(const Position& pos, - [[maybe_unused]] AccumulatorCaches::Cache* cache, - bool psqtOnly) const { - - // When we are refreshing the accumulator of the big net, - // redirect to the version of refresh that uses the refresh table. - // Using the cache for the small net is not beneficial. - if constexpr (HalfDimensions == Eval::NNUE::TransformedFeatureDimensionsBig) - { - update_accumulator_refresh_cache(pos, cache); - return; - } - -#ifdef VECTOR - // Gcc-10.2 unnecessarily spills AVX2 registers if this array - // is defined in the VECTOR code below, once in each branch - vec_t acc[NumRegs]; - psqt_vec_t psqt[NumPsqtRegs]; -#endif - - // Refresh the accumulator - // Could be extracted to a separate function because it's done in 2 places, - // but it's unclear if compilers would correctly handle register allocation. - auto& accumulator = pos.state()->*accPtr; - accumulator.computed[Perspective] = !psqtOnly; - accumulator.computedPSQT[Perspective] = true; - FeatureSet::IndexList active; - FeatureSet::append_active_indices(pos, active); - -#ifdef VECTOR - if (!psqtOnly) - for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j) - { - auto biasesTile = reinterpret_cast(&biases[j * TileHeight]); - for (IndexType k = 0; k < NumRegs; ++k) - acc[k] = biasesTile[k]; - - int i = 0; - for (; i < int(active.size()) - 1; i += 2) - { - IndexType index0 = active[i]; - IndexType index1 = active[i + 1]; - const IndexType offset0 = HalfDimensions * index0 + j * TileHeight; - const IndexType offset1 = HalfDimensions * index1 + j * TileHeight; - auto column0 = reinterpret_cast(&weights[offset0]); - auto column1 = reinterpret_cast(&weights[offset1]); - - for (unsigned k = 0; k < NumRegs; ++k) - acc[k] = vec_add_16(acc[k], vec_add_16(column0[k], column1[k])); - } - for (; i < int(active.size()); ++i) - { - IndexType index = active[i]; - const IndexType offset = HalfDimensions * index + j * TileHeight; - auto column = reinterpret_cast(&weights[offset]); - - for (unsigned k = 0; k < NumRegs; ++k) - acc[k] = vec_add_16(acc[k], column[k]); - } - - auto accTile = - reinterpret_cast(&accumulator.accumulation[Perspective][j * TileHeight]); - for (unsigned k = 0; k < NumRegs; k++) - vec_store(&accTile[k], acc[k]); - } - - for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j) - { - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - psqt[k] = vec_zero_psqt(); - - int i = 0; - for (; i < int(active.size()) - 1; i += 2) - { - IndexType index0 = active[i]; - IndexType index1 = active[i + 1]; - const IndexType offset0 = PSQTBuckets * index0 + j * PsqtTileHeight; - const IndexType offset1 = PSQTBuckets * index1 + j * PsqtTileHeight; - auto columnPsqt0 = reinterpret_cast(&psqtWeights[offset0]); - auto columnPsqt1 = reinterpret_cast(&psqtWeights[offset1]); - - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - psqt[k] = - vec_add_psqt_32(psqt[k], vec_add_psqt_32(columnPsqt0[k], columnPsqt1[k])); - } - for (; i < int(active.size()); ++i) - { - IndexType index = active[i]; - const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight; - auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); - - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]); - } - - auto accTilePsqt = reinterpret_cast( - &accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]); - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - vec_store_psqt(&accTilePsqt[k], psqt[k]); - } -#else - if (!psqtOnly) - std::memcpy(accumulator.accumulation[Perspective], biases, - HalfDimensions * sizeof(BiasType)); - - for (std::size_t k = 0; k < PSQTBuckets; ++k) - accumulator.psqtAccumulation[Perspective][k] = 0; - - for (const auto index : active) - { - if (!psqtOnly) - { - const IndexType offset = HalfDimensions * index; - for (IndexType j = 0; j < HalfDimensions; ++j) - accumulator.accumulation[Perspective][j] += weights[offset + j]; - } - - for (std::size_t k = 0; k < PSQTBuckets; ++k) - accumulator.psqtAccumulation[Perspective][k] += - psqtWeights[index * PSQTBuckets + k]; - } -#endif + entry.psqtOnly = psqtOnly; } template @@ -948,7 +852,7 @@ class FeatureTransformer { psqtOnly); } else - update_accumulator_refresh(pos, cache, psqtOnly); + update_accumulator_refresh_cache(pos, cache, psqtOnly); } template @@ -976,7 +880,7 @@ class FeatureTransformer { psqtOnly); } else - update_accumulator_refresh(pos, cache, psqtOnly); + update_accumulator_refresh_cache(pos, cache, psqtOnly); } template diff --git a/src/nnue/nnue_misc.cpp b/src/nnue/nnue_misc.cpp index 51838fefa44..e92dcc71086 100644 --- a/src/nnue/nnue_misc.cpp +++ b/src/nnue/nnue_misc.cpp @@ -48,7 +48,7 @@ void hint_common_parent_position(const Position& pos, int simpleEvalAbs = std::abs(simple_eval(pos, pos.side_to_move())); if (simpleEvalAbs > Eval::SmallNetThreshold) - networks.small.hint_common_access(pos, nullptr, simpleEvalAbs > Eval::PsqtOnlyThreshold); + networks.small.hint_common_access(pos, &caches.small, simpleEvalAbs > Eval::PsqtOnlyThreshold); else networks.big.hint_common_access(pos, &caches.big, false); }