-
Notifications
You must be signed in to change notification settings - Fork 1
/
find_violations.py
158 lines (129 loc) · 9.5 KB
/
find_violations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
"""
This code is essentially a selective port of the v3 minting code, only including the parts needed to find farmerbot related violations. It reads from a sqlite database as generated by the ingester code, parses the events, and returns a list of any violations for that node.
"""
import sys, sqlite3, collections, logging
from dataclasses import dataclass
import grid3.minting
# How far past the period end we keep checking uptime events. Minting uses 27
# hours, and using the same value means we reach the same conclusions as
# minting (see the note inside check_node).
POST_PERIOD = 60 * 60 * 27
# Tolerance in seconds when matching a stored initial PowerState row to the
# start of a period (see the initial_power query in check_node)
PERIOD_CATCH = 30
# Nodes have 30 minutes to boot after a wakeup is requested
MAX_BOOT_TIME = 60 * 30
# It turns out that namedtuples might not be the most performant option for how
# we are using them, but I didn't find a substantial improvement when switching
# to slotted data classes and these definitions are much more compact :)
NodeUptimeReported = collections.namedtuple('NodeUptimeReported', 'uptime, timestamp, event_index')
PowerTargetChanged = collections.namedtuple('PowerTargetChanged', 'target, timestamp, event_index')
PowerStateChanged = collections.namedtuple('PowerStateChanged', 'state, timestamp, event_index')
# Since Telegram bot's pickle persistence doesn't play nice with namedtuples,
# we use a slotted data class here instead. Technically this class represents
# both actual violations and possible violations. In the second case, finalized
# is set to false. Including the end time of the period we have checked allows
# for comparing the boot_requested time with the amount of time that has
# elapsed (in terms of the timestamps of tfchain blocks we've actually
# processed) to decide how likely it is that a violation has actually occurred
@dataclass
class Violation:
    __slots__ = 'boot_requested', 'booted_at', 'finalized', 'end_time'
    boot_requested: int  # timestamp when a wakeup (power target 'Up') was requested
    booted_at: int       # boot time derived from an uptime report; None if no boot was seen
    finalized: bool      # False while this is only a *possible* violation (period still being processed)
    end_time: int        # last processed timestamp for unfinalized violations; None when finalized with a boot
def check_node(con, node, period, verbose=False):
    """Check one node for farmerbot wakeup violations within a minting period.

    Replays the node's NodeUptimeReported / PowerTargetChanged /
    PowerStateChanged events from the ingester database in chronological
    order, tracking when the node was put to sleep and when a wakeup was
    requested, and returns a list of Violation objects (possibly unfinalized
    if block processing hasn't caught up past the period yet).

    con: open sqlite3 connection to the ingester database.
    node: integer node id to check.
    period: grid3.minting.Period bounding the scan (start/end timestamps).
    verbose: when True, print each event plus a diagnostic uptime tally.
    """
    # Checkpoints indicate the last block number and associated timestamp for
    # which all block data has been ingested and processed. We don't want to
    # assume a node has a violation if block processing is behind current time
    checkpoint_time = con.execute("SELECT value FROM kv WHERE key='checkpoint_time'").fetchone()[0]
    # Nodes have 30 minutes to wake up, so we need to check enough uptime
    # events to see if they manage to wake up after the period has ended. Since
    # the boot time and the time of submitting uptime are different events, and
    # the uptime report can come much later, the post period duration is the
    # effective limit on how long a node can spend "booting up" at the end of
    # the period before getting a violation. We (now) use the same value as
    # minting (27 hours) so that we reach the same conclusion as minting about
    # whether to assign a violation or not
    if checkpoint_time > period.end + POST_PERIOD:
        end_time = period.end + POST_PERIOD
        period_finished = True
    else:
        end_time = checkpoint_time
        period_finished = False
    uptimes = con.execute('SELECT uptime, timestamp, event_index FROM NodeUptimeReported WHERE node_id=? AND timestamp>=? AND timestamp<=?', (node, period.start, end_time)).fetchall()
    targets = con.execute('SELECT target, timestamp, event_index FROM PowerTargetChanged WHERE node_id=? AND timestamp>=? AND timestamp<=?', (node, period.start, end_time)).fetchall()
    states = con.execute('SELECT state, timestamp, event_index FROM PowerStateChanged WHERE node_id=? AND timestamp>=? AND timestamp<=?', (node, period.start, end_time)).fetchall()
    # Since we only fetch initial power configs for the beginning of each
    # period, there's no risk of fetching the wrong one unless we're off by a
    # month. On the other hand, getting the exact timestamp of the block or the
    # block number is relatively expensive, so we use a bit of a hack here
    # (match any row within PERIOD_CATCH seconds of the period start). Maybe a
    # better approach is caching the period start/end info inside the db
    initial_power = con.execute('SELECT state, down_time, target, timestamp FROM PowerState WHERE node_id=? AND timestamp>=? AND timestamp<=?', [node, (period.start - PERIOD_CATCH), (period.start + PERIOD_CATCH)]).fetchone()
    # If there's no entry in the db, it would mean either the node was not
    # created yet at this point in time (thus the default value), or the
    # fetching of this data is not completed. The latter case is potentially
    # problematic, but as long as we get the data eventually, we will catch any
    # associated violations eventually too
    if initial_power is None:
        initial_power = 'Up', None, 'Up', None
    state, down_time, target, timestamp = initial_power
    if state == 'Down':
        # This is now using the same approach as minting (that is, we only care
        # about the actual time the node went to sleep, not when a boot was
        # requested if it happened in the previous minting period). While maybe
        # not immediately obvious, we need the time the node went to sleep here
        # to correctly check if the boot time is greater below
        power_managed = down_time
        if target == 'Up':
            power_manage_boot = timestamp # Block time of first block in period
        else:
            power_manage_boot = None
    else:
        # Node starts the period awake: no pending sleep or wakeup to track
        power_managed = None
        power_manage_boot = None
    # Merge all three event streams and replay them in chain order
    events = []
    events.extend([NodeUptimeReported(*u) for u in uptimes])
    events.extend([PowerStateChanged(*s) for s in states])
    events.extend([PowerTargetChanged(*t) for t in targets])
    events = sorted(events, key=lambda e: (e.timestamp, e.event_index))
    violations = []
    timestamp = period.start
    uptime = None
    total_uptime = 0  # diagnostic tally, only maintained/printed in verbose mode
    for event in events:
        if verbose:
            print(event)
        if isinstance(event, NodeUptimeReported):
            # Only uptime reports arriving while the node is power managed
            # (asleep with a wakeup requested) can signal a late boot
            if power_managed is not None and power_manage_boot is not None:
                # Boot time is inferred from the report: now minus uptime counter
                boot_time = event.timestamp - event.uptime
                if boot_time > power_managed:
                    if verbose:
                        standby_hours = (boot_time - power_managed) / 60 / 60
                        print('Node booted at', boot_time, 'Hours in standby: ', standby_hours)
                        # Minting credits standby shorter than 24h as uptime;
                        # mirror that in the verbose tally
                        if standby_hours < 24 and boot_time < power_manage_boot + MAX_BOOT_TIME:
                            total_uptime += min(boot_time - power_managed, boot_time - period.start)
                    # Booted later than the 30 minute allowance: violation
                    if boot_time > power_manage_boot + MAX_BOOT_TIME:
                        if verbose:
                            print('About to return a violation for this uptime event:', event)
                        violations.append(Violation(power_manage_boot, boot_time, True, None))
                    # The sleep/wake cycle is resolved either way
                    power_managed = None
                    power_manage_boot = None
            if verbose:
                # Diagnostic uptime accounting (verbose only)
                elapsed = event.timestamp - timestamp
                if uptime is None:
                    # First uptime report of the period, scale to actual time in period so far
                    if event.uptime > elapsed:
                        uptime = elapsed
                    else:
                        uptime = event.uptime
                    total_uptime += uptime
                elif event.uptime < uptime:
                    # Uptime counter went backwards: the node rebooted
                    print('Reboot detected. Elapsed time: ', elapsed, 'Uptime accrued: ', event.uptime)
                    uptime = event.uptime
                    total_uptime += uptime
                else:
                    print('Elapsed time: ', elapsed, 'Uptime accrued: ', event.uptime - uptime)
                    total_uptime += event.uptime - uptime
                    uptime = event.uptime
                timestamp = event.timestamp
        elif isinstance(event, PowerTargetChanged):
            # We don't want to check boots requested during the post period.
            # Those will get checked during the next cycle
            if event.target == 'Up' and state == 'Down' and power_manage_boot is None and event.timestamp < period.end:
                power_manage_boot = event.timestamp
            target = event.target
        elif isinstance(event, PowerStateChanged):
            # Only a farmerbot-initiated sleep counts: target must already be 'Down'
            if state == 'Up' and target == 'Down' and event.state == 'Down':
                if power_managed is None:
                    power_managed = event.timestamp
            state = event.state
        if verbose:
            print('power_managed:', power_managed, 'power_manage_boot:', power_manage_boot)
    # There are two scenarios here. First is that we are scanning a completed
    # minting period that ended longer ago than the POST_PERIOD duration. In
    # that case these will be "never booted" violations. The other is that we
    # are scanning an ongoing minting period (or one that ended very recently)
    # and the MAX_BOOT_TIME has elapsed. In the second case we don't actually
    # know if a violation will happen for the node, because boot time is
    # timestamp - uptime. So if the node's uptime counter is already running
    # and it successfully submits an uptime report later, then no violation
    # happens. We mark these as unfinalized
    if power_manage_boot and end_time > power_manage_boot + MAX_BOOT_TIME:
        finalized = period_finished
        violations.append(Violation(power_manage_boot, None, finalized, end_time))
    if verbose:
        if power_managed and period.end - power_managed < 24 * 60 * 60:
            print("Node is standby at end of period, crediting additional uptime: ", period.end - power_managed)
            total_uptime += period.end - power_managed
        print("Total uptime accumulated: ", total_uptime)
    return violations
if __name__ == '__main__':
    # Usage: find_violations.py DB_PATH NODE_ID TIMESTAMP [VERBOSE]
    DB = sys.argv[1]
    NODE = int(sys.argv[2])
    TIME = int(sys.argv[3]) # Pass any timestamp in the period to be checked
    # Any fourth argument (regardless of value) enables verbose output.
    # Replaces the previous try/except IndexError probe of sys.argv[4]
    VERBOSE = len(sys.argv) > 4
    con = sqlite3.connect(DB)
    try:
        # Build the Period once and reuse it (it was previously constructed
        # twice, with the first instance discarded)
        period = grid3.minting.Period(TIME)
        print(check_node(con, NODE, period, VERBOSE))
    finally:
        # Ensure the sqlite connection is released even if check_node raises
        con.close()