Autodisable broken instances (#2266)
yngvar-antonsson authored Aug 14, 2024
1 parent a2db7e2 commit 9439c74
Showing 5 changed files with 165 additions and 7 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.rst
@@ -50,6 +50,9 @@ Added

- Log more cartridge options on start.

- New option ``TARANTOOL_DISABLE_UNRECOVERABLE_INSTANCES`` to automatically disable
instances with state ``InitError`` or ``BootError``.

-------------------------------------------------------------------------------
[2.12.1] - 2024-06-06
-------------------------------------------------------------------------------
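A minimal sketch of enabling the new option, assuming a standard ``init.lua``
entrypoint (the entrypoint itself needs no changes, since ``cartridge.cfg()``
picks the option up from the environment via the ``argparse.get_opts()`` call
shown in ``cartridge.lua`` below):

-- init.lua
local cartridge = require('cartridge')
local ok, err = cartridge.cfg({
    roles = {'vshard-router', 'vshard-storage'},
})
assert(ok, err)

-- Shell: TARANTOOL_DISABLE_UNRECOVERABLE_INSTANCES=true tarantool init.lua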
10 changes: 10 additions & 0 deletions cartridge.lua
Expand Up @@ -872,6 +872,16 @@ local function cfg(opts, box_opts)

issues.set_limits(issue_limits)

local res, err = argparse.get_opts({
disable_unrecoverable_instances = 'boolean',
})

if err ~= nil then
return nil, err
end

issues.disable_unrecoverable(res.disable_unrecoverable_instances)

if opts.upload_prefix ~= nil then
local path = opts.upload_prefix
if not path:startswith('/') then
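For context, ``argparse.get_opts()`` resolves each declared option from the
command line or from a ``TARANTOOL_``-prefixed environment variable and casts
the value to the declared type. A standalone sketch of the call added above
(the ``--disable-unrecoverable-instances`` flag spelling is an assumption
based on the usual argparse naming convention; the environment variable name
is confirmed by the test and docs in this commit):

local argparse = require('cartridge.argparse')

-- Resolves --disable-unrecoverable-instances or
-- TARANTOOL_DISABLE_UNRECOVERABLE_INSTANCES and casts it to a boolean.
local opts, err = argparse.get_opts({
    disable_unrecoverable_instances = 'boolean',
})
if err ~= nil then
    error(err)
end
-- opts.disable_unrecoverable_instances is now true, false, or nil (unset)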
53 changes: 46 additions & 7 deletions cartridge/issues.lua
@@ -82,6 +82,10 @@
--
-- * critical: "Disk error on instance ... ".
--
-- Disabled instances:
--
-- * warning: "Instance had Error and was disabled"
--
-- Custom issues (defined by user):
--
-- * Custom roles can announce more issues with their own level, topic
@@ -149,6 +153,7 @@
}

vars:new('limits', default_limits)
vars:new('disable_unrecoverable', false)
vars:new('instance_uuid')
vars:new('replicaset_uuid')

@@ -694,8 +699,8 @@ local function list_on_cluster()
end
end

-- Check aliens in membership

-- Check aliens in membership and unrecoverable instances
local unrecoverable_uuids = {}
for uri, member in membership.pairs() do
local uuid = member.payload.uuid
if member.status == 'alive'
@@ -711,8 +716,35 @@
)
})
end
end
local state = member.payload.state
if vars.disable_unrecoverable
and (state == 'InitError' or state == 'BootError')
then
if uuid == nil then
for k, v in pairs(topology_cfg.servers) do
if v.uri == uri then
uuid = k
goto uuid_found
end
end
end

::uuid_found::
if uuid ~= nil then -- uuid may still be nil; skip such members
table.insert(unrecoverable_uuids, uuid)
table.insert(ret, {
level = 'warning',
topic = 'autodisable',
instance_uuid = uuid,
message = string.format(
'Instance %s had %s and was disabled',
describe(uri),
state
)
})
end
end
end

-- Get each instance issues (replication, failover, memory usage)

@@ -728,22 +760,26 @@
{uri_list = uri_list, timeout = 1}
)

local disk_failure_uuids = {}
local uuids_to_disable = {}
for _, issues in pairs(issues_map) do
for _, issue in pairs(issues) do
table.insert(ret, issue)
if issue.topic == 'disk_failure' then
table.insert(disk_failure_uuids, issue.instance_uuid)
table.insert(uuids_to_disable, issue.instance_uuid)
disk_failure_cache[issue.instance_uuid] = issue
end
end
end

for _, issue in pairs(disk_failure_cache) do
table.insert(ret, issue)
end

if #disk_failure_uuids > 0 then
lua_api_topology.disable_servers(disk_failure_uuids)
if vars.disable_unrecoverable then
uuids_to_disable = fun.chain(uuids_to_disable, unrecoverable_uuids):totable()
end
if #uuids_to_disable > 0 then
lua_api_topology.disable_servers(uuids_to_disable)
end

-- to use this counter in tarantool/metrics
@@ -820,4 +856,7 @@
default_limits = default_limits,
validate_limits = validate_limits,
set_limits = set_limits,
disable_unrecoverable = function(disable)
vars.disable_unrecoverable = disable
end,
}
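The toggle is exposed as a setter rather than a public variable, mirroring
``set_limits`` above; ``cfg()`` calls it once at start-up. A sketch of
flipping it at runtime, e.g. from a console or a test:

local issues = require('cartridge.issues')
issues.disable_unrecoverable(true)  -- later issue checks will disable
                                    -- InitError/BootError instances
issues.disable_unrecoverable(false) -- restore the default behavior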
7 changes: 7 additions & 0 deletions rst/cartridge_admin.rst
@@ -1566,6 +1566,13 @@ Cartridge displays cluster and instances issues in WebUI:
instances will be disabled (on instances with vshard, vshard storage will
also be disabled) and you need to fix the disk issue manually.

* Disabled instances:

* **warning**: "Instance had Error and was disabled". Reported only when
``TARANTOOL_DISABLE_UNRECOVERABLE_INSTANCES`` is set to true.
When you see this issue, the affected instance has already been disabled,
and you need to fix the underlying error manually.

* Custom issues (defined by user):

* Custom roles can announce more issues with their own level, topic
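Re-enabling a disabled instance is a manual step once the underlying error is
fixed; the integration test below does it with the ``enable_servers`` GraphQL
mutation, roughly as follows (the uuid value is illustrative, and ``server``
stands for a luatest server handle):

server:graphql({query = [[
    mutation {
        cluster { enable_servers(uuids: ["<instance-uuid>"]) { uri } }
    }
]]})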
99 changes: 99 additions & 0 deletions test/integration/autodisable_test.lua
@@ -0,0 +1,99 @@
local fio = require('fio')
local t = require('luatest')
local g = t.group()

local helpers = require('test.helper')

g.before_all(function()
g.cluster = helpers.Cluster:new({
datadir = fio.tempdir(),
server_command = helpers.entrypoint('srv_basic'),
use_vshard = true,
cookie = helpers.random_cookie(),
replicasets = { {
alias = 'router',
roles = {'vshard-router'},
servers = 1,
}, {
alias = 'storage',
roles = {'vshard-storage'},
servers = 2
},
},
env = {
TARANTOOL_DISABLE_UNRECOVERABLE_INSTANCES = 'true',
}
})
g.cluster:start()
end)

g.after_all(function()
g.cluster:stop()
fio.rmtree(g.cluster.datadir)
end)

function g.test_autodisable()
local router = g.cluster.main_server
local storage_1 = g.cluster:server('storage-1')

-- this leads to InitError
fio.rename(storage_1.workdir..'/config', storage_1.workdir..'/config-tmp')

storage_1:restart()

-- check disabled instances
-- only two issues are produced
t.helpers.retrying({}, function()
t.assert_covers(helpers.list_cluster_issues(router), {
{
level = 'warning',
instance_uuid = storage_1.instance_uuid,
topic = 'autodisable',
message = 'Instance localhost:13302 (storage-1) had InitError and was disabled',
},
})
end)

local resp = router:graphql({
query = [[
{
servers {
uri
disabled
}
}
]]
})

table.sort(resp['data']['servers'], function(a, b) return a.uri < b.uri end)

t.assert_items_equals(resp['data']['servers'], {
{
uri = 'localhost:13301',
disabled = false,
},
{
uri = 'localhost:13302',
disabled = true,
},
{
uri = 'localhost:13303',
disabled = false,
},
})

-- restart instance without InitError
fio.rename(storage_1.workdir..'/config-tmp', storage_1.workdir..'/config')
storage_1:restart()

-- enable it back
g.cluster.main_server:graphql({query = ([[
mutation {
cluster { enable_servers(uuids: ["%s"]) { uri } }
}
]]):format(storage_1.instance_uuid)})

t.helpers.retrying({}, function()
t.assert_equals(helpers.list_cluster_issues(router), {})
end)
end
