Skip to content

Commit

Permalink
A configuration for the max size of a set
Browse files Browse the repository at this point in the history
Summary:
Sets in a query can grow to an arbitrary size. So in order to protect the server, we want to have a max size so that we can fail in a controlled manner, rather than e.g. OOM.

See D67517993 for the default server config

Reviewed By: donsbot

Differential Revision: D68108359

fbshipit-source-id: f337ed8650296aefdd3113bfa4db93f264f7cd0f
  • Loading branch information
Josef Svenningsson authored and facebook-github-bot committed Jan 13, 2025
1 parent 8c1ae4e commit dcc1226
Show file tree
Hide file tree
Showing 11 changed files with 155 additions and 35 deletions.
1 change: 1 addition & 0 deletions glean/db/Glean/Query/UserQuery.hs
Original file line number Diff line number Diff line change
Expand Up @@ -1030,6 +1030,7 @@ mkQueryRuntimeOptions
<|> config_default_max_bytes -- from ServerConfig
, queryMaxTimeMs = userQueryOptions_max_time_ms
<|> config_default_max_time_ms -- from ServerConfig
, queryMaxSetSize = config_max_set_size_bytes
, queryWantStats = userQueryOptions_collect_facts_searched
, queryDepth = if
| userQueryOptions_recursive && not userQueryOptions_omit_results ->
Expand Down
7 changes: 7 additions & 0 deletions glean/hs/Glean/RTS/Foreign/Query.hsc
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ data QueryRuntimeOptions = QueryRuntimeOptions
{ queryMaxResults :: Maybe Int64
, queryMaxBytes :: Maybe Int64
, queryMaxTimeMs :: Maybe Int64
, queryMaxSetSize :: Maybe Int64
, queryDepth :: Depth
, queryWantStats :: Bool
}
Expand Down Expand Up @@ -106,6 +107,7 @@ executeCompiled inventory ownership facts
maxr = fromIntegral (fromMaybe 0 queryMaxResults)
maxb = fromIntegral (fromMaybe 0 queryMaxBytes)
maxt = fromIntegral (fromMaybe 0 queryMaxTimeMs)
maxs = fromIntegral (fromMaybe 0 queryMaxSetSize)
withTraversal = case compiledQueryResultTraversal of
Nothing -> ($ nullPtr)
Just sub -> with sub
Expand All @@ -123,6 +125,7 @@ executeCompiled inventory ownership facts
maxr
maxb
maxt
maxs
depth
expand_pids
num_expand_pids
Expand All @@ -149,6 +152,7 @@ restartCompiled inventory ownership facts pid
maxr = fromIntegral (fromMaybe 0 queryMaxResults)
maxb = fromIntegral (fromMaybe 0 queryMaxBytes)
maxt = fromIntegral (fromMaybe 0 queryMaxTimeMs)
maxs = fromIntegral (fromMaybe 0 queryMaxSetSize)
in
withDepth queryDepth $ \(depth, expand_pids, num_expand_pids) ->
using
Expand All @@ -161,6 +165,7 @@ restartCompiled inventory ownership facts pid
maxr
maxb
maxt
maxs
depth
expand_pids
num_expand_pids
Expand Down Expand Up @@ -268,6 +273,7 @@ foreign import ccall safe glean_query_execute_compiled
-> Word64 -- max_results
-> Word64 -- max_bytes
-> Word64 -- max_time_ms
-> Word64 -- max_set_size
-> Word64 -- depth
-> Ptr Word64 -- expand_pids
-> Word64 -- num_expand_pids
Expand All @@ -284,6 +290,7 @@ foreign import ccall safe glean_query_restart_compiled
-> Word64 -- max_results
-> Word64 -- max_bytes
-> Word64 -- max_time_ms
-> Word64 -- max_set_size
-> Word64 -- depth
-> Ptr Word64 -- expand_pids
-> Word64 -- num_expand_pids
Expand Down
6 changes: 6 additions & 0 deletions glean/rts/ffi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ const char* glean_query_execute_compiled(
uint64_t max_results,
uint64_t max_bytes,
uint64_t max_time_ms,
uint64_t max_set_size,
uint64_t depth,
uint64_t* expand_pids,
uint64_t num_expand_pids,
Expand All @@ -253,6 +254,8 @@ const char* glean_query_execute_compiled(
max_bytes == 0 ? folly::none : folly::Optional<uint64_t>(max_bytes),
max_time_ms == 0 ? folly::none
: folly::Optional<uint64_t>(max_time_ms),
max_set_size == 0 ? folly::none
: folly::Optional<uint64_t>(max_set_size),
static_cast<Depth>(depth),
expandPids,
want_stats)
Expand All @@ -269,6 +272,7 @@ const char* glean_query_restart_compiled(
uint64_t max_results,
uint64_t max_bytes,
uint64_t max_time_ms,
uint64_t max_set_size,
uint64_t depth,
uint64_t* expand_pids,
uint64_t num_expand_pids,
Expand All @@ -291,6 +295,8 @@ const char* glean_query_restart_compiled(
max_bytes == 0 ? folly::none : folly::Optional<uint64_t>(max_bytes),
max_time_ms == 0 ? folly::none
: folly::Optional<uint64_t>(max_time_ms),
max_set_size == 0 ? folly::none
: folly::Optional<uint64_t>(max_set_size),
static_cast<Depth>(depth),
expandPids,
want_stats,
Expand Down
2 changes: 2 additions & 0 deletions glean/rts/ffi.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ const char* glean_query_execute_compiled(
uint64_t max_results,
uint64_t max_bytes,
uint64_t max_time_ms,
uint64_t max_set_size,
uint64_t recursive,
uint64_t* expand_pids,
uint64_t num_expand_pids,
Expand All @@ -189,6 +190,7 @@ const char* glean_query_restart_compiled(
uint64_t max_results,
uint64_t max_bytes,
uint64_t max_time_ms,
uint64_t max_set_size,
uint64_t recursive,
uint64_t* expand_pids,
uint64_t num_expand_pids,
Expand Down
75 changes: 52 additions & 23 deletions glean/rts/query.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,41 @@ struct QueryExecutor : SetOps {
// A live fact iterator
using IterToken = uint64_t;

// iterators
struct Iter {
std::unique_ptr<rts::FactIterator> iter;
// remember the type and current key so that we can capture the
// state of this iterator for a continuation.
Pid type;
Id id;
size_t prefix_size;
bool first;
};

QueryExecutor(
Inventory& inventory,
Define& facts,
DefineOwnership* ownership,
Subroutine& sub,
Pid pid,
std::shared_ptr<Subroutine> traverse,
Depth depth,
std::unordered_set<Pid, folly::hasher<Pid>>& expandPids,
bool wantStats,
std::vector<Iter> iters,
uint64_t max_set_size)
: SetOps(max_set_size),
inventory(inventory),
facts(facts),
ownership(ownership),
sub(sub),
pid(pid),
traverse(traverse),
depth(depth),
expandPids(expandPids),
wantStats(wantStats),
iters(std::move(iters)) {}

//
// Initiate a traversal of facts for a particular predicate and
// key prefix. Returns a token that can be passed to next() to
Expand Down Expand Up @@ -221,17 +256,6 @@ struct QueryExecutor : SetOps {
bool wantStats;
uint64_t result_bytes;

// iterators
struct Iter {
std::unique_ptr<rts::FactIterator> iter;
// remember the type and current key so that we can capture the
// state of this iterator for a continuation.
Pid type;
Id id;
size_t prefix_size;
bool first;
};

std::vector<Iter> iters;
};

Expand Down Expand Up @@ -518,23 +542,24 @@ std::unique_ptr<QueryResults> executeQuery(
folly::Optional<uint64_t> maxResults,
folly::Optional<uint64_t> maxBytes,
folly::Optional<uint64_t> maxTime,
folly::Optional<uint64_t> maxSetSize,
Depth depth,
std::unordered_set<Pid, folly::hasher<Pid>>& expandPids,
bool wantStats,
std::vector<QueryExecutor::Iter> iters,
std::optional<Subroutine::Activation::State> restart) {
QueryExecutor q{
.inventory = inventory,
.facts = facts,
.ownership = ownership,
.sub = sub,
.pid = pid,
.traverse = traverse,
.depth = depth,
.expandPids = expandPids,
.wantStats = wantStats,
.iters = std::move(iters),
};
QueryExecutor q(
inventory,
facts,
ownership,
sub,
pid,
traverse,
depth,
expandPids,
wantStats,
std::move(iters),
maxSetSize ? *maxSetSize : UINT64_MAX);

// coarse_steady_clock is around 1ms granularity which is enough for us.
q.timeout = Clock::now();
Expand Down Expand Up @@ -616,6 +641,7 @@ std::unique_ptr<QueryResults> restartQuery(
folly::Optional<uint64_t> maxResults,
folly::Optional<uint64_t> maxBytes,
folly::Optional<uint64_t> maxTime,
folly::Optional<uint64_t> maxSetSize,
Depth depth,
std::unordered_set<Pid, folly::hasher<Pid>>& expandPids,
bool wantStats,
Expand Down Expand Up @@ -713,6 +739,7 @@ std::unique_ptr<QueryResults> restartQuery(
maxResults,
maxBytes,
maxTime,
maxSetSize,
depth,
expandPids,
wantStats,
Expand All @@ -730,6 +757,7 @@ std::unique_ptr<QueryResults> executeQuery(
folly::Optional<uint64_t> maxResults,
folly::Optional<uint64_t> maxBytes,
folly::Optional<uint64_t> maxTime,
folly::Optional<uint64_t> maxSetSize,
Depth depth,
std::unordered_set<Pid, folly::hasher<Pid>>& expandPids,
bool wantStats) {
Expand All @@ -743,6 +771,7 @@ std::unique_ptr<QueryResults> executeQuery(
maxResults,
maxBytes,
maxTime,
maxSetSize,
depth,
expandPids,
wantStats,
Expand Down
2 changes: 2 additions & 0 deletions glean/rts/query.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ std::unique_ptr<QueryResults> executeQuery(
folly::Optional<uint64_t> maxResults,
folly::Optional<uint64_t> maxBytes,
folly::Optional<uint64_t> maxTime,
folly::Optional<uint64_t> maxSetSize,
Depth depth,
std::unordered_set<Pid, folly::hasher<Pid>>& expandPids,
bool wantStats);
Expand All @@ -62,6 +63,7 @@ std::unique_ptr<QueryResults> restartQuery(
folly::Optional<uint64_t> maxResults,
folly::Optional<uint64_t> maxBytes,
folly::Optional<uint64_t> maxTime,
folly::Optional<uint64_t> maxSetSize,
Depth depth,
std::unordered_set<Pid, folly::hasher<Pid>>& expandPids,
bool wantStats,
Expand Down
44 changes: 40 additions & 4 deletions glean/rts/set.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,31 @@
#include "glean/rts/binary.h"
#include "glean/rts/set.h"

#include <stdexcept>

namespace facebook {
namespace glean {
namespace rts {

SetOps::SetToken SetOps::newSet() {
sets.emplace_back(BytestringSet());
set_sizes.emplace_back(0);
return sets.size() - 1;
}

void SetOps::insertOutputSet(SetOps::SetToken token, binary::Output* out) {
sets[token].insert(out->moveToFbString());
auto& s = sets[token];
auto size = out->size();
if (set_sizes[token] + size <= max_set_size) {
s.insert(out->moveToFbString());
set_sizes[token] += size;
} else {
throw std::overflow_error(folly::sformat(
"Set size limit exceeded for standard set. Set token: {}. Max size: {}. Size: {}",
token,
max_set_size,
set_sizes[token] + size));
}
}

void SetOps::setToArray(SetOps::SetToken token, binary::Output* out) {
Expand All @@ -37,24 +51,45 @@ void SetOps::setToArray(SetOps::SetToken token, binary::Output* out) {

void SetOps::freeSet(SetOps::SetToken token) {
sets.erase(sets.begin() + token);
set_sizes.erase(set_sizes.begin() + token);
}

SetOps::SetToken SetOps::newWordSet() {
wordsets.emplace_back(WordSet());
wordset_sizes.emplace_back(0);
return wordsets.size() - 1;
}

void SetOps::insertWordSet(SetOps::SetToken token, uint64_t value) {
wordsets[token].insert(value);
auto& set = wordsets[token];
if (set.size() + sizeof(uint64_t) <= max_set_size) {
set.insert(value);
wordset_sizes[token] += sizeof(uint64_t);
} else {
rts::error(
"Set size limit exceeded for nat set. Set token: {}. Max size: {}. Size: {}",
token,
max_set_size,
set.size() + sizeof(uint64_t));
}
}

void SetOps::insertBytesWordSet(
SetToken token,
const unsigned char* start,
const unsigned char* end) {
auto& set = wordsets[token];
for (const unsigned char* p = start; p < end; p++) {
set.insert(*p);
if (set.size() + (end - start) <= max_set_size) {
for (const unsigned char* p = start; p < end; p++) {
set.insert(*p);
}
wordset_sizes[token] += end - start;
} else {
rts::error(
"Set size limit exceeded for byte set. Set token: {}. Max size: {}. Size: {}",
token,
max_set_size,
set.size() + (end - start));
}
}

Expand All @@ -76,6 +111,7 @@ void SetOps::byteSetToByteArray(SetOps::SetToken token, binary::Output* out) {

void SetOps::freeWordSet(SetOps::SetToken token) {
wordsets.erase(wordsets.begin() + token);
wordset_sizes.erase(wordset_sizes.begin() + token);
}

} // namespace rts
Expand Down
6 changes: 6 additions & 0 deletions glean/rts/set.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ using WordSet = std::set<uint64_t>;
struct SetOps {
using SetToken = uint64_t;

explicit SetOps(uint64_t max_set_size = UINT64_MAX)
: max_set_size(max_set_size) {}

SetToken newSet();

void insertOutputSet(SetToken token, binary::Output* out);
Expand All @@ -57,6 +60,9 @@ struct SetOps {

std::vector<BytestringSet> sets = {};
std::vector<WordSet> wordsets = {};
std::vector<uint64_t> set_sizes = {};
std::vector<uint64_t> wordset_sizes = {};
uint64_t max_set_size = UINT64_MAX;
};

} // namespace rts
Expand Down
7 changes: 7 additions & 0 deletions glean/test/lib/Glean/Database/Test.hs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ module Glean.Database.Test
, setMemoryStorage
, setDBVersion
, setCompactOnCompletion
, setMaxSetSize
, withTestEnv
, kickOffTestDB
, waitUntilComplete
Expand All @@ -29,6 +30,7 @@ module Glean.Database.Test
import Util.STM
import Data.Default
import Data.Functor
import Data.Int
import Data.List (foldl')
import Data.Map (Map)
import qualified Data.Map as Map
Expand Down Expand Up @@ -98,6 +100,11 @@ setCompactOnCompletion cfg = cfg
{ cfgServerConfig = cfgServerConfig cfg <&> \scfg -> scfg
{ ServerConfig.config_compact_on_completion = True } }

setMaxSetSize :: Int64 -> Setting
setMaxSetSize i cfg = cfg
{ cfgServerConfig = cfgServerConfig cfg <&> \scfg -> scfg
{ ServerConfig.config_max_set_size_bytes = Just i } }

withTestEnv
:: [Setting]
-> (Env -> IO a)
Expand Down
Loading

0 comments on commit dcc1226

Please sign in to comment.