Skip to content

Commit

Permalink
Merge pull request #5 from cloudspannerecosystem/fix_unspecified_inte…
Browse files Browse the repository at this point in the history
…rleaved_load

Fix child tables are included when only parent table is specified
  • Loading branch information
yfuruyama authored Mar 17, 2022
2 parents 77382f4 + 6c682a6 commit cdd9f57
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 46 deletions.
35 changes: 2 additions & 33 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,39 +99,7 @@ tables:
#### Loading into interleaved tables
Loading data into interleaved tables is supported but has some behavioral side effects that should be known. When loading data into an interleaved table, GCSB will detect all tables in the hierarchy and begin loading data at the familial apex. The configured number of operations applies to this apex table. By default, the number of operations for each child table is its parent's number of operations multiplied by 5. For example:
Using our [test INTERLEAVE schema](schemas/multi_table.sql), we see an INTERLEAVE relationship between the `Singers`, `Albums`, and `Songs` tables.

If we execute a load operation against these tables with total operations set to `10` we will see the following occur

```sh
gcsb load -t Songs -o 10
+---------+------------+------+-------+---------+
| TABLE | OPERATIONS | READ | WRITE | CONTEXT |
+---------+------------+------+-------+---------+
| Singers | 10 | N/A | N/A | LOAD |
| Albums | 50 | N/A | N/A | LOAD |
| Songs | 250 | N/A | N/A | LOAD |
+---------+------------+------+-------+---------+
```

In this case, for each child table we take the number of operations for the parent and multiply it by the default value of `5`.

To change this multiplier, we use the yaml configuration file for the table we want. The `operations.total` value becomes a multiplier.

```yaml
tables:
- name: Albums
operations:
total: 10
- name: Songs
operations:
total: 20
```

At present, GCSB will sort its operations from the apex down, meaning it will populate the `Singers` table first, then its child, and then the next child. Multiple table operations are not mixed within the same transaction.
Loading data into interleaved tables is not supported yet. If you want to create splits in the database, you can load data into parent tables.
### Run
Expand Down Expand Up @@ -206,6 +174,7 @@ The tool supports the following generator type in the configuration.

### Not Supported (yet)

- [ ] Interleaved tables for Load and Run phases.
- [ ] Generating read operations utilizing [ReadByIndex](https://cloud.google.com/spanner/docs/samples/spanner-read-data-with-index#spanner_read_data_with_index-go)
- [ ] Generating NULL values for load operations. If a column is NULLable, gcsb will still generate a value for it.
- [ ] JSON column types
Expand Down
25 changes: 13 additions & 12 deletions pkg/workload/core.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ import (
"errors"
"fmt"
"log"
"sort"
"strings"
"sync"
"time"
Expand Down Expand Up @@ -136,13 +135,17 @@ func (c *CoreWorkload) Initialize() error {

// Plan will create *Targets for each TargetName
func (c *CoreWorkload) Plan(pt JobType, targets []string) error {
var isInterleaved bool
var needOperationMultiplication bool
apexTables := make([]schema.Table, 0)

// search func for looking if targets contains the given string
contains := func(s []string, searchterm string) bool {
i := sort.SearchStrings(s, searchterm)
return i < len(s) && s[i] == searchterm
contains := func(items []string, searchterm string) bool {
for _, item := range items {
if item == searchterm {
return true
}
}
return false
}

// We can only run against one table at a time, so only expand interleaved tables if we are loading
Expand All @@ -155,7 +158,8 @@ func (c *CoreWorkload) Plan(pt JobType, targets []string) error {
}

// If the table is interleaved, find its entire lineage and add it to the target list
if st.IsInterleaved() {
if st.IsInterleaved() && !st.IsApex() {
needOperationMultiplication = true // Used below
relatives := st.GetAllRelationNames()
for _, n := range relatives {
if n == t { // Avoid inserting t twice for some reason... i dont have time to figure out why this is happenign
Expand All @@ -177,11 +181,8 @@ func (c *CoreWorkload) Plan(pt JobType, targets []string) error {
return fmt.Errorf("table '%s' missing from information schema", t)
}

if st.IsInterleaved() {
isInterleaved = true // Used below
if st.IsApex() {
apexTables = append(apexTables, st) // Collect a slice of apex tables
}
if st.IsInterleaved() && st.IsApex() {
apexTables = append(apexTables, st) // Collect a slice of apex tables
}

// Create target
Expand Down Expand Up @@ -271,7 +272,7 @@ func (c *CoreWorkload) Plan(pt JobType, targets []string) error {
}

// So if our phase is load, the operations per target are actually multipliers. Now we go through and do that multiplication
if pt == JobLoad && isInterleaved {
if pt == JobLoad && needOperationMultiplication {
for _, at := range apexTables {
apexTarget := FindTargetByName(c.plan, at.Name())

Expand Down
96 changes: 95 additions & 1 deletion pkg/workload/core_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
package workload

import "testing"
import (
"testing"

"github.com/cloudspannerecosystem/gcsb/pkg/config"
"github.com/cloudspannerecosystem/gcsb/pkg/schema"
)

func TestBucketOps(t *testing.T) {
tests := []struct {
Expand Down Expand Up @@ -38,6 +43,66 @@ func TestBucketOps(t *testing.T) {
}
}

// TestPlan checks which tables CoreWorkload.Plan schedules for a load job.
// The fixture schema holds two tables, Singers and Albums, with Albums wired
// up as the child of Singers. Each case feeds Plan an initial target list and
// compares the planned table names (ignoring order) with the expected ones.
func TestPlan(t *testing.T) {
	fixture := schema.NewSchema()

	// Parent table.
	singers := schema.NewTable()
	singers.SetName("Singers")
	fixture.Tables().AddTable(singers)

	// Child table, linked to the parent in both directions.
	albums := schema.NewTable()
	albums.SetName("Albums")
	albums.SetParent(singers)
	albums.SetParentName(singers.Name())
	singers.SetChild(albums)
	singers.SetChildName(albums.Name())
	fixture.Tables().AddTable(albums)

	type planCase struct {
		desc           string
		initialTargets []string
		wantTargets    []string
	}

	cases := []planCase{
		{desc: "no tables are planned", initialTargets: []string{}, wantTargets: []string{}},
		// Asking for the parent alone must not pull in the child.
		{desc: "only parent table is planned", initialTargets: []string{"Singers"}, wantTargets: []string{"Singers"}},
		// Asking for the child is expected to bring its parent along.
		{desc: "parent table is also planned", initialTargets: []string{"Albums"}, wantTargets: []string{"Singers", "Albums"}},
		{desc: "all tables are planned", initialTargets: []string{"Singers", "Albums"}, wantTargets: []string{"Singers", "Albums"}},
	}

	for _, tc := range cases {
		t.Run(tc.desc, func(t *testing.T) {
			// A fresh workload per case so plans never leak between subtests.
			w := &CoreWorkload{
				Schema: fixture,
				Config: &config.Config{},
			}

			if err := w.Plan(JobLoad, tc.initialTargets); err != nil {
				t.Fatalf("workload.Plan got error: %v", err)
			}

			planned := extractTableNamesFromTargets(w.plan)
			if isSameStringSet(planned, tc.wantTargets) {
				return
			}
			t.Errorf("workload.Plan(%v) = %v, but want = %v", tc.initialTargets, planned, tc.wantTargets)
		})
	}
}

func isSameSlice(a, b []int) bool {
if len(a) != len(b) {
return false
Expand All @@ -49,3 +114,32 @@ func isSameSlice(a, b []int) bool {
}
return true
}

// isSameStringSet reports whether a and b contain the same strings with the
// same multiplicities, regardless of order. Slices of different lengths are
// never considered equal.
func isSameStringSet(a, b []string) bool {
	if len(a) != len(b) {
		return false
	}

	// Tally every string in a, then consume that tally with the strings in b.
	remaining := make(map[string]int, len(a))
	for i := range a {
		remaining[a[i]]++
	}
	for i := range b {
		if remaining[b[i]] == 0 {
			// b holds a string that a lacks, or more copies of it than a has.
			return false
		}
		remaining[b[i]]--
	}

	// The lengths match and nothing was over-consumed, so every tally is zero.
	return true
}

// extractTableNamesFromTargets returns the TableName of each target, in the
// same order as targets. The result is nil when targets is empty.
func extractTableNamesFromTargets(targets []*Target) []string {
	var tableNames []string
	for i := range targets {
		tableNames = append(tableNames, targets[i].TableName)
	}
	return tableNames
}

0 comments on commit cdd9f57

Please sign in to comment.