Skip to content

Commit

Permalink
re-organized runbooks, added generic function for generation, added w…
Browse files Browse the repository at this point in the history
…iki replace runbook
  • Loading branch information
magdalendobson committed Oct 10, 2024
1 parent 54b8894 commit f452d31
Show file tree
Hide file tree
Showing 19 changed files with 5,406 additions and 133 deletions.
2 changes: 1 addition & 1 deletion neurips23/streaming/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Each vector is assumed to have a unique *id* which never changes throughout the

## Available Runbooks

Now that the number of runbooks has started to increase significantly, here we list the available runbooks with a brief description of each.
Now that the number of runbooks has started to increase significantly, here we list the available runbooks (found in the `runbooks` folder within this directory) with a brief description of each.

1. `simple_runbook.yaml`: A runbook executing a short sequence of insertions, searches, and deletions to aid with debugging and testing.
2. `simple_replace_runbook.yaml`: A runbook executing a short sequence of inserts, searches, and replaces to aid with debugging and testing.
Expand Down
67 changes: 0 additions & 67 deletions neurips23/streaming/generate_msmarco100m_runbooks.py

This file was deleted.

65 changes: 0 additions & 65 deletions neurips23/streaming/generate_wiki35m_runbooks.py

This file was deleted.

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
133 changes: 133 additions & 0 deletions neurips23/streaming/runbooks/gen_expiration_time_runbook.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import yaml
import os
import random

'''
dataset_name: dataset key as specified in benchmark/datasets.py
dataset_size: total number of points in the dataset
max_t: number of timesteps (the dataset is inserted in max_t equal batches)
runbook_filename: path of the file to save the runbook to
ratios: tuple of three increasing integers; a uniform draw in [0, ratios[2]]
    is compared against ratios[0] and ratios[1] to decide whether each batch
    is kept, deleted after timesteps[1] steps, or deleted after timesteps[2] steps
timesteps: per-bucket delay (in timesteps) before a batch is deleted
seed: seed given to the random generator, for reproducible runbooks
do_replace: whether to include replace operations in the runbook or not
'''
def gen_exp_time_runbook(dataset_name: str, dataset_size: int, max_t: int, runbook_filename: str, ratios: tuple, timesteps: tuple, seed: int = 0, do_replace: bool = False) -> None:
    """Generate a streaming "expiration time" runbook and dump it to YAML.

    The dataset is split into ``max_t`` equal batches. At each timestep one
    batch is inserted, and a random draw assigns the batch to one of three
    buckets: kept forever, deleted after ``timesteps[1]`` steps, or deleted
    after ``timesteps[2]`` steps. When ``do_replace`` is set, a random prefix
    of each batch is withheld from the insert and later issued as a 'replace'
    operation (after roughly half the bucket's deletion delay). A 'search'
    operation is emitted at the end of every timestep.

    Args:
        dataset_name: dataset key as specified in benchmark/datasets.py.
        dataset_size: total number of points in the dataset.
        max_t: number of timesteps (and therefore insert batches).
        runbook_filename: path the YAML runbook is written to.
        ratios: three increasing integers; a draw d = randint(0, ratios[2])
            selects the bucket via d <= ratios[0] / d < ratios[1] / otherwise.
        timesteps: per-bucket delay, in timesteps, before deletion.
        seed: seed for the random generator (runbooks are reproducible).
        do_replace: if True, also emit 'replace' operations.
    """
    random.seed(seed)
    data = {dataset_name: {}}

    # Track the high-water mark of live points; recorded as "max_pts" so the
    # harness knows how much index capacity the runbook needs.
    max_num_points=0
    num_points=0

    batch_size = dataset_size//max_t
    # to_delete[j] / to_replace[j] hold operations scheduled to execute at
    # future timestep j (each entry is a tuple of id/tag ranges).
    to_delete=[[] for _ in range(max_t)]
    to_replace=[[] for _ in range(max_t)]

    # t is the global runbook step counter; every emitted operation gets the
    # next value of t as its key.
    t=1

    for i in range(max_t):
        # With replaces enabled, insert only a random 50-90% prefix of the
        # batch now; the withheld tail is supplied later via 'replace'.
        if do_replace:
            fraction = random.uniform(.5, .9)
        else:
            fraction = 1.0
        end = int(fraction*(i+1)*batch_size)
        # ids_* is the withheld tail of this batch (the new vectors a replace
        # introduces); tags_* is the equally-sized range of already-inserted
        # tags those vectors will overwrite.
        ids_start = end
        ids_end = (i+1)*batch_size
        tags_start = i*batch_size
        tags_end = tags_start + (ids_end - ids_start)
        replace_info = (tags_start, tags_end, ids_start, ids_end)
        delete_info = (tags_start, end)
        data[dataset_name][t]={
            'operation': 'insert',
            'start': i*(batch_size),
            'end': end
        }
        t+=1

        num_points+=batch_size

        max_num_points=max(max_num_points,num_points)


        # Bucket this batch. With the wiki config ratios=(0, 4, 18) the draw
        # is over 19 values: 1/19 keep, 3/19 late delete, 15/19 early delete.
        data_type = random.randint(0, ratios[2])
        if data_type <= ratios[0]:
            pass  # batch is never deleted
        elif data_type > ratios[0] and data_type < ratios[1]:
            # Schedule deletion timesteps[1] steps from now (skipped if it
            # would land past the end of the runbook).
            if (i+timesteps[1] < max_t):
                to_delete[i+timesteps[1]].append(delete_info)
        else:
            if (i+timesteps[2] < max_t):
                to_delete[i+timesteps[2]].append(delete_info)



        if do_replace:
            if data_type <= ratios[0]:
                # Never-deleted batches are replaced halfway through the
                # remaining runbook.
                remaining_steps = (max_t - t)//2
                # NOTE(review): unlike the branches below there is no bound
                # check here. t counts emitted operations (not timesteps), so
                # once t > max_t, remaining_steps goes negative and
                # i+remaining_steps indexes an already-processed bucket,
                # silently dropping the replace — confirm this is intended.
                to_replace[i+remaining_steps].append(replace_info)
                # with probability 1/19, the points get replaced at t_max-t/2 steps
            elif data_type > ratios[0] and data_type < ratios[1]:
                # Replace after half the late-delete delay (50 steps for the
                # wiki config, where timesteps[1] == 100).
                if (i + timesteps[1]//2 < max_t):
                    to_replace[i+timesteps[1]//2].append(replace_info)
                    # with probability 3/19, the points get replaced after 50 steps
            else:
                # Replace after half the early-delete delay (10 steps for the
                # wiki config, where timesteps[2] == 20).
                if (i + timesteps[2]//2 < max_t):
                    to_replace[i+timesteps[2]//2].append(replace_info)
                    # with probability 15/19, the points get replaced after 10 steps

        # Emit the deletes previously scheduled for this timestep.
        for (start, end) in to_delete[i]:
            data[dataset_name][t]={
                'operation': 'delete',
                'start': start,
                'end': end
            }
            t+=1
            # NOTE(review): each delete removes (end - start) points, which is
            # at most batch_size when a prefix was withheld; subtracting a full
            # batch_size slightly undercounts num_points — max_pts stays an
            # upper bound, so this is conservative.
            num_points-=batch_size

        # Emit the replaces previously scheduled for this timestep.
        for (tags_start, tags_end, ids_start, ids_end) in to_replace[i]:
            data[dataset_name][t] ={
                'operation' : 'replace',
                'tags_start': tags_start,
                'tags_end': tags_end,
                'ids_start': ids_start,
                'ids_end': ids_end
            }
            t += 1

        # Every timestep ends with a search over the current live set.
        data[dataset_name][t]={
            'operation': 'search',
        }
        t+=1

    data[dataset_name]["max_pts"]=max_num_points

    with open(runbook_filename, 'w') as outfile:
        yaml.dump(data, outfile, default_flow_style=False)

# The published expiration-time runbooks, one table row per output file:
# (ratios, timesteps, seed, output filename, dataset key, dataset size,
#  number of timesteps, whether to emit replace operations).
configs = [
    ((0, 4, 18), (0, 100, 20), 809,
     'wiki_exp_time_runbook.yaml',
     'wikipedia-35M', 35000000, 350, False),
    ((0, 4, 18), (0, 100, 20), 809,
     'wikipedia-35M_expiration_time_replace_runbook.yaml',
     'wikipedia-35M', 35000000, 350, True),
    ((0, 6, 25), (0, 200, 50), 809,
     'msmarco_exp_time_runbook.yaml',
     'msmarco-100M', 101070374, 1000, False),
]

# Generate each runbook in order; the fixed seeds make the output reproducible.
for (ratios, timesteps, seed, dataset_file,
     dataset_name, dataset_size, max_t, do_replace) in configs:
    gen_exp_time_runbook(dataset_name, dataset_size, max_t, dataset_file,
                         ratios, timesteps, seed, do_replace)

File renamed without changes.
File renamed without changes.
Loading

0 comments on commit f452d31

Please sign in to comment.