Skip to content

Commit

Permalink
re-organized runbooks, added generic function for generation, added w…
Browse files Browse the repository at this point in the history
…iki replace runbook
  • Loading branch information
magdalendobson committed Oct 10, 2024
1 parent 54b8894 commit f452d31
Show file tree
Hide file tree
Showing 19 changed files with 5,406 additions and 133 deletions.
2 changes: 1 addition & 1 deletion neurips23/streaming/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Each vector is assumed to have a unique *id* which never changes throughout the

## Available Runbooks

Now that the number of runbooks has started to increase significantly, here we list the available runbooks with a brief description of each.
Now that the number of runbooks has started to increase significantly, here we list the available runbooks (found in the `runbooks` folder within this directory) with a brief description of each.

1. `simple_runbook.yaml`: A runbook executing a short sequence of insertions, searches, and deletions to aid with debugging and testing.
2. `simple_replace_runbook.yaml`: A runbook executing a short sequence of inserts, searches, and replaces to aid with debugging and testing.
Expand Down
67 changes: 0 additions & 67 deletions neurips23/streaming/generate_msmarco100m_runbooks.py

This file was deleted.

65 changes: 0 additions & 65 deletions neurips23/streaming/generate_wiki35m_runbooks.py

This file was deleted.

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
133 changes: 133 additions & 0 deletions neurips23/streaming/runbooks/gen_expiration_time_runbook.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import yaml
import os
import random

'''
dataset_name: dataset key as specified in benchmark/datasets.py
dataset_size: total number of points in the dataset
max_t: number of timesteps (the dataset is inserted in max_t equal batches)
runbook_filename: path of the file to save the runbook to
ratios: tuple of three increasing integers; a uniform draw in [0, ratios[2]]
    is compared against ratios[0] and ratios[1] to decide whether each batch
    is kept, deleted after timesteps[1] steps, or deleted after timesteps[2] steps
timesteps: per-bucket delay (in timesteps) before a batch is deleted
seed: seed given to the random generator, for reproducible runbooks
do_replace: whether to include replace operations in the runbook or not
'''
def gen_exp_time_runbook(dataset_name: str, dataset_size: int, max_t: int, runbook_filename: str, ratios: tuple, timesteps: tuple, seed: int = 0, do_replace: bool = False) -> None:
    """Generate a streaming "expiration time" runbook and dump it to YAML.

    The dataset is split into ``max_t`` equal batches. At each timestep one
    batch is inserted, and a random draw assigns the batch to one of three
    buckets: kept forever, deleted after ``timesteps[1]`` steps, or deleted
    after ``timesteps[2]`` steps. When ``do_replace`` is set, a random prefix
    of each batch is withheld from the insert and later issued as a 'replace'
    operation (after roughly half the bucket's deletion delay). A 'search'
    operation is emitted at the end of every timestep.

    Args:
        dataset_name: dataset key as specified in benchmark/datasets.py.
        dataset_size: total number of points in the dataset.
        max_t: number of timesteps (and therefore insert batches).
        runbook_filename: path the YAML runbook is written to.
        ratios: three increasing integers; a draw d = randint(0, ratios[2])
            selects the bucket via d <= ratios[0] / d < ratios[1] / otherwise.
        timesteps: per-bucket delay, in timesteps, before deletion.
        seed: seed for the random generator (runbooks are reproducible).
        do_replace: if True, also emit 'replace' operations.
    """
    random.seed(seed)
    data = {dataset_name: {}}

    # Track the high-water mark of live points; recorded as "max_pts" so the
    # harness knows how much index capacity the runbook needs.
    max_num_points=0
    num_points=0

    batch_size = dataset_size//max_t
    # to_delete[j] / to_replace[j] hold operations scheduled to execute at
    # future timestep j (each entry is a tuple of id/tag ranges).
    to_delete=[[] for _ in range(max_t)]
    to_replace=[[] for _ in range(max_t)]

    # t is the global runbook step counter; every emitted operation gets the
    # next value of t as its key.
    t=1

    for i in range(max_t):
        # With replaces enabled, insert only a random 50-90% prefix of the
        # batch now; the withheld tail is supplied later via 'replace'.
        if do_replace:
            fraction = random.uniform(.5, .9)
        else:
            fraction = 1.0
        end = int(fraction*(i+1)*batch_size)
        # ids_* is the withheld tail of this batch (the new vectors a replace
        # introduces); tags_* is the equally-sized range of already-inserted
        # tags those vectors will overwrite.
        ids_start = end
        ids_end = (i+1)*batch_size
        tags_start = i*batch_size
        tags_end = tags_start + (ids_end - ids_start)
        replace_info = (tags_start, tags_end, ids_start, ids_end)
        delete_info = (tags_start, end)
        data[dataset_name][t]={
            'operation': 'insert',
            'start': i*(batch_size),
            'end': end
        }
        t+=1

        num_points+=batch_size

        max_num_points=max(max_num_points,num_points)


        # Bucket this batch. With the wiki config ratios=(0, 4, 18) the draw
        # is over 19 values: 1/19 keep, 3/19 late delete, 15/19 early delete.
        data_type = random.randint(0, ratios[2])
        if data_type <= ratios[0]:
            pass  # batch is never deleted
        elif data_type > ratios[0] and data_type < ratios[1]:
            # Schedule deletion timesteps[1] steps from now (skipped if it
            # would land past the end of the runbook).
            if (i+timesteps[1] < max_t):
                to_delete[i+timesteps[1]].append(delete_info)
        else:
            if (i+timesteps[2] < max_t):
                to_delete[i+timesteps[2]].append(delete_info)



        if do_replace:
            if data_type <= ratios[0]:
                # Never-deleted batches are replaced halfway through the
                # remaining runbook.
                remaining_steps = (max_t - t)//2
                # NOTE(review): unlike the branches below there is no bound
                # check here. t counts emitted operations (not timesteps), so
                # once t > max_t, remaining_steps goes negative and
                # i+remaining_steps indexes an already-processed bucket,
                # silently dropping the replace — confirm this is intended.
                to_replace[i+remaining_steps].append(replace_info)
                # with probability 1/19, the points get replaced at t_max-t/2 steps
            elif data_type > ratios[0] and data_type < ratios[1]:
                # Replace after half the late-delete delay (50 steps for the
                # wiki config, where timesteps[1] == 100).
                if (i + timesteps[1]//2 < max_t):
                    to_replace[i+timesteps[1]//2].append(replace_info)
                    # with probability 3/19, the points get replaced after 50 steps
            else:
                # Replace after half the early-delete delay (10 steps for the
                # wiki config, where timesteps[2] == 20).
                if (i + timesteps[2]//2 < max_t):
                    to_replace[i+timesteps[2]//2].append(replace_info)
                    # with probability 15/19, the points get replaced after 10 steps

        # Emit the deletes previously scheduled for this timestep.
        for (start, end) in to_delete[i]:
            data[dataset_name][t]={
                'operation': 'delete',
                'start': start,
                'end': end
            }
            t+=1
            # NOTE(review): each delete removes (end - start) points, which is
            # at most batch_size when a prefix was withheld; subtracting a full
            # batch_size slightly undercounts num_points — max_pts stays an
            # upper bound, so this is conservative.
            num_points-=batch_size

        # Emit the replaces previously scheduled for this timestep.
        for (tags_start, tags_end, ids_start, ids_end) in to_replace[i]:
            data[dataset_name][t] ={
                'operation' : 'replace',
                'tags_start': tags_start,
                'tags_end': tags_end,
                'ids_start': ids_start,
                'ids_end': ids_end
            }
            t += 1

        # Every timestep ends with a search over the current live set.
        data[dataset_name][t]={
            'operation': 'search',
        }
        t+=1

    data[dataset_name]["max_pts"]=max_num_points

    with open(runbook_filename, 'w') as outfile:
        yaml.dump(data, outfile, default_flow_style=False)

# The published expiration-time runbooks, one table row per output file:
# (ratios, timesteps, seed, output filename, dataset key, dataset size,
#  number of timesteps, whether to emit replace operations).
configs = [
    ((0, 4, 18), (0, 100, 20), 809,
     'wiki_exp_time_runbook.yaml',
     'wikipedia-35M', 35000000, 350, False),
    ((0, 4, 18), (0, 100, 20), 809,
     'wikipedia-35M_expiration_time_replace_runbook.yaml',
     'wikipedia-35M', 35000000, 350, True),
    ((0, 6, 25), (0, 200, 50), 809,
     'msmarco_exp_time_runbook.yaml',
     'msmarco-100M', 101070374, 1000, False),
]

# Generate each runbook in order; the fixed seeds make the output reproducible.
for (ratios, timesteps, seed, dataset_file,
     dataset_name, dataset_size, max_t, do_replace) in configs:
    gen_exp_time_runbook(dataset_name, dataset_size, max_t, dataset_file,
                         ratios, timesteps, seed, do_replace)

File renamed without changes.
File renamed without changes.
Loading

0 comments on commit f452d31

Please sign in to comment.