fleet/orbit/pkg/dataflatten/flatten_test.go

// based on github.com/kolide/launcher/pkg/osquery/tables
package dataflatten

import (
	"bytes"
	"encoding/json"
	"os"
	"path/filepath"
	"sort"
	"sync"
	"testing"

	"github.com/stretchr/testify/require"
)

// flattenTestCase is a single entry in the table-driven tests in this file.
//
//   - in is the raw input handed to Json or Jsonl; tests that load a fixture
//     file leave it empty.
//   - out is the set of rows the flatten call is expected to produce. Order
//     does not matter, testFlattenCase sorts before comparing.
//   - options are forwarded to the flatten call.
//   - comment is used as the subtest name and is included in assertion
//     failure messages.
//   - err, when true, means the flatten call must return an error; out is
//     not checked in that case.
type flattenTestCase struct {
	in      string
	out     []Row
	options []FlattenOpts
	comment string
	err     bool
}

// TestFlatten_Complex2 flattens the parsed testdata/complex2.json fixture,
// first with no options and then with IncludeNulls, and checks the rows
// against the expectations listed below.
func TestFlatten_Complex2(t *testing.T) {
	t.Parallel()

	// Parse the fixture once; every subtest flattens the same value.
	raw, err := os.ReadFile(filepath.Join("testdata", "complex2.json"))
	require.NoError(t, err, "reading file")

	var parsed interface{}
	err = json.Unmarshal(raw, &parsed)
	require.NoError(t, err, "unmarshalling json")

	cases := []flattenTestCase{
		{
			// No options: the paths expected with an empty value in the
			// IncludeNulls case below are absent here.
			out: []Row{
				{Path: []string{"addons", "0", "bool1"}, Value: "true"},
				{Path: []string{"addons", "0", "nest2", "0", "string2"}, Value: "foo"},
				{Path: []string{"addons", "0", "nest3", "string6"}, Value: "null"},
				{Path: []string{"addons", "0", "nest3", "string7"}, Value: "A Very Long Sentence"},
				{Path: []string{"addons", "0", "nest3", "string8"}, Value: "short"},
				{Path: []string{"addons", "0", "string1"}, Value: "hello"},
			},
		},
		{
			comment: "includes null",
			options: []FlattenOpts{IncludeNulls()},
			out: []Row{
				{Path: []string{"addons", "0", "bool1"}, Value: "true"},
				{Path: []string{"addons", "0", "nest2", "0", "null3"}, Value: ""},
				{Path: []string{"addons", "0", "nest2", "0", "null4"}, Value: ""},
				{Path: []string{"addons", "0", "nest2", "0", "string2"}, Value: "foo"},
				{Path: []string{"addons", "0", "nest3", "string3"}, Value: ""},
				{Path: []string{"addons", "0", "nest3", "string4"}, Value: ""},
				{Path: []string{"addons", "0", "nest3", "string5"}, Value: ""},
				{Path: []string{"addons", "0", "nest3", "string6"}, Value: "null"},
				{Path: []string{"addons", "0", "nest3", "string7"}, Value: "A Very Long Sentence"},
				{Path: []string{"addons", "0", "nest3", "string8"}, Value: "short"},
				{Path: []string{"addons", "0", "null1"}, Value: ""},
				{Path: []string{"addons", "0", "null2"}, Value: ""},
				{Path: []string{"addons", "0", "string1"}, Value: "hello"},
			},
		},
	}

	for i := range cases {
		// Take a per-iteration copy so the parallel closure does not
		// observe a later element.
		tc := cases[i]
		t.Run(tc.comment, func(t *testing.T) {
			t.Parallel()

			rows, flattenErr := Flatten(parsed, tc.options...)
			testFlattenCase(t, tc, rows, flattenErr)
		})
	}
}

// TestFlatten_NestingBug flattens the parsed testdata/nested.json fixture
// with and without IncludeNulls and checks the rows produced for the
// nested "nest1" object.
func TestFlatten_NestingBug(t *testing.T) {
	t.Parallel()

	// Parse the fixture once; every subtest flattens the same value.
	raw, err := os.ReadFile(filepath.Join("testdata", "nested.json"))
	require.NoError(t, err, "reading file")

	var parsed interface{}
	err = json.Unmarshal(raw, &parsed)
	require.NoError(t, err, "unmarshalling json")

	cases := []flattenTestCase{
		{
			// No options: string1 and string2 are not expected here, they
			// only show up (with empty values) in the IncludeNulls case.
			out: []Row{
				{Path: []string{"addons", "0", "name"}, Value: "Nested Strings"},
				{Path: []string{"addons", "0", "nest1", "string3"}, Value: "string3"},
				{Path: []string{"addons", "0", "nest1", "string4"}, Value: "string4"},
				{Path: []string{"addons", "0", "nest1", "string5"}, Value: "string5"},
				{Path: []string{"addons", "0", "nest1", "string6"}, Value: "string6"},
			},
		},
		{
			comment: "includes null",
			options: []FlattenOpts{IncludeNulls()},
			out: []Row{
				{Path: []string{"addons", "0", "name"}, Value: "Nested Strings"},
				{Path: []string{"addons", "0", "nest1", "string1"}, Value: ""},
				{Path: []string{"addons", "0", "nest1", "string2"}, Value: ""},
				{Path: []string{"addons", "0", "nest1", "string3"}, Value: "string3"},
				{Path: []string{"addons", "0", "nest1", "string4"}, Value: "string4"},
				{Path: []string{"addons", "0", "nest1", "string5"}, Value: "string5"},
				{Path: []string{"addons", "0", "nest1", "string6"}, Value: "string6"},
			},
		},
	}

	for i := range cases {
		// Take a per-iteration copy so the parallel closure does not
		// observe a later element.
		tc := cases[i]
		t.Run(tc.comment, func(t *testing.T) {
			t.Parallel()

			rows, flattenErr := Flatten(parsed, tc.options...)
			testFlattenCase(t, tc, rows, flattenErr)
		})
	}
}

// TestFlatten_Jsonl_Complex feeds the testdata/animals.jsonl fixture to
// Jsonl under a range of queries. Every expected path starts with a numeric
// segment ("0", "1", ...), which the queries match with a leading "*".
func TestFlatten_Jsonl_Complex(t *testing.T) {
	t.Parallel()

	// Read the fixture once; each subtest wraps the bytes in its own reader.
	raw, err := os.ReadFile(filepath.Join("testdata", "animals.jsonl"))
	require.NoError(t, err, "reading file")

	// Several queries below are expected to select exactly this user, so
	// the expectation is declared once and shared between those cases.
	firstUser := []Row{
		{Path: []string{"2", "users", "0", "favorites", "0"}, Value: "ants"},
		{Path: []string{"2", "users", "0", "name"}, Value: "Alex Aardvark"},
		{Path: []string{"2", "users", "0", "uuid"}, Value: "abc123"},
		{Path: []string{"2", "users", "0", "id"}, Value: "1"},
	}

	cases := []flattenTestCase{
		{
			comment: "all together",
			out: []Row{
				{Path: []string{"0", "metadata", "testing"}, Value: "true"},
				{Path: []string{"0", "metadata", "version"}, Value: "1.0.1"},
				{Path: []string{"1", "system"}, Value: "users demo"},
				{Path: []string{"2", "users", "0", "favorites", "0"}, Value: "ants"},
				{Path: []string{"2", "users", "0", "id"}, Value: "1"},
				{Path: []string{"2", "users", "0", "name"}, Value: "Alex Aardvark"},
				{Path: []string{"2", "users", "0", "uuid"}, Value: "abc123"},
				{Path: []string{"2", "users", "1", "favorites", "0"}, Value: "mice"},
				{Path: []string{"2", "users", "1", "favorites", "1"}, Value: "birds"},
				{Path: []string{"2", "users", "1", "id"}, Value: "2"},
				{Path: []string{"2", "users", "1", "name"}, Value: "Bailey Bobcat"},
				{Path: []string{"2", "users", "1", "uuid"}, Value: "def456"},
				{Path: []string{"2", "users", "2", "favorites", "0"}, Value: "seeds"},
				{Path: []string{"2", "users", "2", "id"}, Value: "3"},
				{Path: []string{"2", "users", "2", "name"}, Value: "Cam Chipmunk"},
				{Path: []string{"2", "users", "2", "uuid"}, Value: "ghi789"},
				{Path: []string{"3", "0"}, Value: "array-item-A"},
				{Path: []string{"3", "1"}, Value: "array-item-B"},
				{Path: []string{"3", "2"}, Value: "array-item-C"},
			},
		},
		{
			comment: "query metadata",
			options: []FlattenOpts{WithQuery([]string{"*", "metadata"})},
			out: []Row{
				{Path: []string{"0", "metadata", "testing"}, Value: "true"},
				{Path: []string{"0", "metadata", "version"}, Value: "1.0.1"},
			},
		},
		{
			comment: "array by #",
			options: []FlattenOpts{WithQuery([]string{"*", "users", "0"})},
			out:     firstUser,
		},
		{
			comment: "array by id value",
			options: []FlattenOpts{WithQuery([]string{"*", "users", "id=>1"})},
			out:     firstUser,
		},
		{
			comment: "array by uuid",
			options: []FlattenOpts{WithQuery([]string{"*", "users", "uuid=>abc123"})},
			out:     firstUser,
		},
		{
			comment: "array by name with suffix wildcard",
			options: []FlattenOpts{WithQuery([]string{"*", "users", "name=>Al*"})},
			out:     firstUser,
		},
		{
			comment: "array by name with prefix wildcard",
			options: []FlattenOpts{WithQuery([]string{"*", "users", "name=>*Aardvark"})},
			out:     firstUser,
		},
		{
			comment: "array by name with suffix and prefix",
			options: []FlattenOpts{WithQuery([]string{"*", "users", "name=>*Aardv*"})},
			out:     firstUser,
		},
		{
			comment: "who likes ants, array re-written",
			options: []FlattenOpts{WithQuery([]string{"*", "users", "#name", "favorites", "ants"})},
			out: []Row{
				{Path: []string{"2", "users", "Alex Aardvark", "favorites", "0"}, Value: "ants"},
			},
		},
		{
			comment: "rewritten and filtered",
			options: []FlattenOpts{WithQuery([]string{"*", "users", "#name=>Al*", "id"})},
			out: []Row{
				{Path: []string{"2", "users", "Alex Aardvark", "id"}, Value: "1"},
			},
		},
		{
			comment: "bad key name",
			options: []FlattenOpts{WithQuery([]string{"*", "users", "#nokey"})},
			out:     []Row{},
		},
		{
			comment: "rewrite array to map",
			options: []FlattenOpts{WithQuery([]string{"*", "users", "#name", "id"})},
			out: []Row{
				{Path: []string{"2", "users", "Alex Aardvark", "id"}, Value: "1"},
				{Path: []string{"2", "users", "Bailey Bobcat", "id"}, Value: "2"},
				{Path: []string{"2", "users", "Cam Chipmunk", "id"}, Value: "3"},
			},
		},
	}

	for i := range cases {
		// Take a per-iteration copy so the parallel closure does not
		// observe a later element.
		tc := cases[i]
		t.Run(tc.comment, func(t *testing.T) {
			t.Parallel()

			rows, flattenErr := Jsonl(bytes.NewReader(raw), tc.options...)
			testFlattenCase(t, tc, rows, flattenErr)
		})
	}
}

// TestFlatten_Complex flattens the parsed testdata/animals.json fixture
// under a range of queries: plain key selection, array selection by index
// or by "key=>value" match (with wildcards), and "#key" rewriting of array
// indices into the value of a key.
//
// Every query case carries a comment so that no subtest runs under an
// auto-generated name; the metadata case is named like its counterpart in
// TestFlatten_Jsonl_Complex.
func TestFlatten_Complex(t *testing.T) {
	t.Parallel()

	// Do the unmarshaling here, so we don't keep doing it again and again
	dataRaw, err := os.ReadFile(filepath.Join("testdata", "animals.json"))
	require.NoError(t, err, "reading file")
	var dataIn interface{}
	require.NoError(t, json.Unmarshal(dataRaw, &dataIn), "unmarshalling json")

	// We do a bunch of tests to select this user. So we'll pull
	// this out here and make the testcases more DRY
	testdataUser0 := []Row{
		{Path: []string{"users", "0", "favorites", "0"}, Value: "ants"},
		{Path: []string{"users", "0", "id"}, Value: "1"},
		{Path: []string{"users", "0", "name"}, Value: "Alex Aardvark"},
		{Path: []string{"users", "0", "uuid"}, Value: "abc123"},
	}

	var tests = []flattenTestCase{
		{
			out: []Row{
				{Path: []string{"metadata", "testing"}, Value: "true"},
				{Path: []string{"metadata", "version"}, Value: "1.0.1"},
				{Path: []string{"system"}, Value: "users demo"},
				{Path: []string{"users", "0", "favorites", "0"}, Value: "ants"},
				{Path: []string{"users", "0", "id"}, Value: "1"},
				{Path: []string{"users", "0", "name"}, Value: "Alex Aardvark"},
				{Path: []string{"users", "0", "uuid"}, Value: "abc123"},
				{Path: []string{"users", "1", "favorites", "0"}, Value: "mice"},
				{Path: []string{"users", "1", "favorites", "1"}, Value: "birds"},
				{Path: []string{"users", "1", "id"}, Value: "2"},
				{Path: []string{"users", "1", "name"}, Value: "Bailey Bobcat"},
				{Path: []string{"users", "1", "uuid"}, Value: "def456"},
				{Path: []string{"users", "2", "favorites", "0"}, Value: "seeds"},
				{Path: []string{"users", "2", "id"}, Value: "3"},
				{Path: []string{"users", "2", "name"}, Value: "Cam Chipmunk"},
				{Path: []string{"users", "2", "uuid"}, Value: "ghi789"},
			},
			comment: "all together",
		},
		{
			comment: "query metadata",
			options: []FlattenOpts{WithQuery([]string{"metadata"})},
			out: []Row{
				{Path: []string{"metadata", "testing"}, Value: "true"},
				{Path: []string{"metadata", "version"}, Value: "1.0.1"},
			},
		},
		{
			comment: "array by #",
			options: []FlattenOpts{WithQuery([]string{"users", "0"})},
			out:     testdataUser0,
		},
		{
			comment: "array by id value",
			options: []FlattenOpts{WithQuery([]string{"users", "id=>1"})},
			out:     testdataUser0,
		},
		{
			comment: "array by uuid",
			options: []FlattenOpts{WithQuery([]string{"users", "uuid=>abc123"})},
			out:     testdataUser0,
		},
		{
			comment: "array by name with suffix wildcard",
			options: []FlattenOpts{WithQuery([]string{"users", "name=>Al*"})},
			out:     testdataUser0,
		},
		{
			comment: "array by name with prefix wildcard",
			options: []FlattenOpts{WithQuery([]string{"users", "name=>*Aardvark"})},
			out:     testdataUser0,
		},
		{
			comment: "array by name with suffix and prefix",
			options: []FlattenOpts{WithQuery([]string{"users", "name=>*Aardv*"})},
			out:     testdataUser0,
		},
		{
			comment: "who likes ants, array re-written",
			options: []FlattenOpts{WithQuery([]string{"users", "#name", "favorites", "ants"})},
			out: []Row{
				{Path: []string{"users", "Alex Aardvark", "favorites", "0"}, Value: "ants"},
			},
		},
		{
			comment: "rewritten and filtered",
			options: []FlattenOpts{WithQuery([]string{"users", "#name=>Al*", "id"})},
			out: []Row{
				{Path: []string{"users", "Alex Aardvark", "id"}, Value: "1"},
			},
		},
		{
			comment: "bad key name",
			options: []FlattenOpts{WithQuery([]string{"users", "#nokey"})},
			out:     []Row{},
		},
		{
			comment: "rewrite array to map",
			options: []FlattenOpts{WithQuery([]string{"users", "#name", "id"})},
			out: []Row{
				{Path: []string{"users", "Alex Aardvark", "id"}, Value: "1"},
				{Path: []string{"users", "Bailey Bobcat", "id"}, Value: "2"},
				{Path: []string{"users", "Cam Chipmunk", "id"}, Value: "3"},
			},
		},
	}

	for _, tt := range tests {
		// Per-iteration copy for the parallel closure below.
		tt := tt
		t.Run(tt.comment, func(t *testing.T) {
			t.Parallel()

			actual, err := Flatten(dataIn, tt.options...)
			testFlattenCase(t, tt, actual, err)
		})
	}
}

// TestFlatten_ArrayMaps flattens the same array-of-objects document twice:
// once with no query, where the expected paths use the array index, and
// once with a "#id" query, where the expected paths use each element's id
// value instead.
func TestFlatten_ArrayMaps(t *testing.T) {
	t.Parallel()

	// Both cases share this input document.
	const input = `{"data": [{"v":1,"id":"a"},{"v":2,"id":"b"},{"v":3,"id":"c"}]}`

	cases := []flattenTestCase{
		{
			comment: "nested array as array",
			in:      input,
			out: []Row{
				{Path: []string{"data", "0", "id"}, Value: "a"},
				{Path: []string{"data", "0", "v"}, Value: "1"},
				{Path: []string{"data", "1", "id"}, Value: "b"},
				{Path: []string{"data", "1", "v"}, Value: "2"},
				{Path: []string{"data", "2", "id"}, Value: "c"},
				{Path: []string{"data", "2", "v"}, Value: "3"},
			},
		},
		{
			comment: "nested array as map",
			in:      input,
			options: []FlattenOpts{WithQuery([]string{"data", "#id"})},
			out: []Row{
				{Path: []string{"data", "a", "id"}, Value: "a"},
				{Path: []string{"data", "a", "v"}, Value: "1"},
				{Path: []string{"data", "b", "id"}, Value: "b"},
				{Path: []string{"data", "b", "v"}, Value: "2"},
				{Path: []string{"data", "c", "id"}, Value: "c"},
				{Path: []string{"data", "c", "v"}, Value: "3"},
			},
		},
	}

	for i := range cases {
		// Take a per-iteration copy so the parallel closure does not
		// observe a later element.
		tc := cases[i]
		t.Run(tc.comment, func(t *testing.T) {
			t.Parallel()

			rows, flattenErr := Json([]byte(tc.in), tc.options...)
			testFlattenCase(t, tc, rows, flattenErr)
		})
	}
}

// TestFlatten runs Json over small inline documents: an input expected to
// error, arrays with and without nulls, mixed scalar types, and an object
// holding a nested array.
func TestFlatten(t *testing.T) {
	t.Parallel()

	cases := []flattenTestCase{
		{
			// Expected to be rejected with an error.
			in:  "a",
			err: true,
		},
		{
			comment: "skip null",
			in:      `["a", null]`,
			out: []Row{
				{Path: []string{"0"}, Value: "a"},
			},
		},
		{
			comment: "includes null",
			in:      `["a", "b", null]`,
			options: []FlattenOpts{IncludeNulls()},
			out: []Row{
				{Path: []string{"0"}, Value: "a"},
				{Path: []string{"1"}, Value: "b"},
				{Path: []string{"2"}, Value: ""},
			},
		},
		{
			in: `["1"]`,
			out: []Row{
				{Path: []string{"0"}, Value: "1"},
			},
		},
		{
			comment: "mixed types",
			in:      `["a", true, false, "1", 2, 3.3]`,
			out: []Row{
				{Path: []string{"0"}, Value: "a"},
				{Path: []string{"1"}, Value: "true"},
				{Path: []string{"2"}, Value: "false"},
				{Path: []string{"3"}, Value: "1"},
				{Path: []string{"4"}, Value: "2"},
				{Path: []string{"5"}, Value: "3.3"},
			},
		},
		{
			comment: "nested types",
			in:      `{"a": 1, "b": "2.2", "c": [1,2,3]}`,
			out: []Row{
				{Path: []string{"a"}, Value: "1"},
				{Path: []string{"b"}, Value: "2.2"},
				{Path: []string{"c", "0"}, Value: "1"},
				{Path: []string{"c", "1"}, Value: "2"},
				{Path: []string{"c", "2"}, Value: "3"},
			},
		},
	}

	for i := range cases {
		// Take a per-iteration copy so the parallel closure does not
		// observe a later element.
		tc := cases[i]
		t.Run(tc.comment, func(t *testing.T) {
			t.Parallel()

			rows, flattenErr := Json([]byte(tc.in), tc.options...)
			testFlattenCase(t, tc, rows, flattenErr)
		})
	}
}

// TestFlattenJsonlErrors passes inputs to Jsonl that are expected to be
// rejected with an error.
func TestFlattenJsonlErrors(t *testing.T) {
	t.Parallel()

	cases := []flattenTestCase{
		{
			in:  "a",
			err: true,
		},
		{
			// Left over from an attempt to parse JSON embedded in a file
			// that is not strictly jsonl. It should error; maybe revisit
			// this in the future.
			comment: "valid json inline text",
			in:      `valid json is hidden["a"]in me`,
			err:     true,
		},
		{
			// Left over from the same attempt as the case above: a valid
			// JSON line surrounded by non-JSON lines. It should error;
			// maybe revisit this in the future.
			comment: "valid json sandwich",
			in: `
			there is some json under me
			["a"]
			there is some json above me
			`,
			err: true,
		},
	}

	for i := range cases {
		// Take a per-iteration copy so the parallel closure does not
		// observe a later element.
		tc := cases[i]
		t.Run(tc.comment, func(t *testing.T) {
			t.Parallel()

			rows, flattenErr := Jsonl(bytes.NewBufferString(tc.in), tc.options...)
			testFlattenCase(t, tc, rows, flattenErr)
		})
	}
}

// testFlattenCaseMutex serializes calls to testFlattenCase. It was added
// because of data races seen when running the tests locally (they don't
// seem to appear in CI). Maybe remove it if it slows down CI too much.
var testFlattenCaseMutex sync.Mutex

// testFlattenCase checks the outcome of one flatten call against a single
// test case. Normally this would be in a for loop, instead it's abstracted
// here to make it simpler to split up a giant array of test cases.
//
// When tt.err is set, only the presence of an error is asserted. Otherwise
// actualErr must be nil and actual must hold the same rows as tt.out,
// irrespective of order. actual is sorted in place; tt.out is left
// untouched.
func testFlattenCase(t *testing.T, tt flattenTestCase, actual []Row, actualErr error) {
	// Report failures at the calling subtest rather than inside this helper.
	t.Helper()

	testFlattenCaseMutex.Lock()
	defer testFlattenCaseMutex.Unlock()

	if tt.err {
		require.Error(t, actualErr, "test %s %s", tt.in, tt.comment)
		return
	}

	require.NoError(t, actualErr, "test %s %s", tt.in, tt.comment)

	// Despite being an array, data is returned unordered. This greatly
	// complicates our testing. We can either sort it, or use an unordered
	// comparison operator. The `require.ElementsMatch` produces much
	// harder to read diffs, so instead we'll sort things.
	//
	// The expectation is sorted on a copy: several parallel subtests share
	// one expected slice, and sorting it in place would reorder the
	// caller's test table underneath them. The zero-capacity slice
	// expression forces append to allocate while keeping a nil tt.out nil
	// and an empty tt.out non-nil, so the comparison below is unaffected.
	expected := append(tt.out[:0:0], tt.out...)
	sort.SliceStable(expected, func(i, j int) bool { return expected[i].StringPath("/") < expected[j].StringPath("/") })
	sort.SliceStable(actual, func(i, j int) bool { return actual[i].StringPath("/") < actual[j].StringPath("/") })
	require.EqualValues(t, expected, actual, "test %s %s", tt.in, tt.comment)
}

// TestFlattenSliceOfMaps passes []map[string]interface{} values straight to
// Flatten (no JSON decoding involved) and checks the returned rows, plus
// one input whose value is expected to make Flatten return an error.
func TestFlattenSliceOfMaps(t *testing.T) {
	t.Parallel()

	cases := []struct {
		name    string
		in      interface{}
		opts    []FlattenOpts
		out     []Row
		wantErr bool
	}{
		{
			name: "single",
			in: []map[string]interface{}{
				{"id": "a", "v": 1},
			},
			opts: []FlattenOpts{},
			out: []Row{
				{Path: []string{"0", "id"}, Value: "a"},
				{Path: []string{"0", "v"}, Value: "1"},
			},
		},
		{
			name: "multiple",
			in: []map[string]interface{}{
				{"id": "a", "v": 1},
				{"id": "b", "v": 2},
				{"id": "c", "v": 3},
			},
			opts: []FlattenOpts{},
			out: []Row{
				{Path: []string{"0", "id"}, Value: "a"},
				{Path: []string{"0", "v"}, Value: "1"},
				{Path: []string{"1", "id"}, Value: "b"},
				{Path: []string{"1", "v"}, Value: "2"},
				{Path: []string{"2", "id"}, Value: "c"},
				{Path: []string{"2", "v"}, Value: "3"},
			},
		},
		{
			// No rows are expected alongside the error, so out stays nil.
			name: "error",
			in: []map[string]interface{}{
				{"id": []string{"this should cause an error"}},
			},
			opts:    []FlattenOpts{},
			wantErr: true,
		},
	}

	for i := range cases {
		// Take a per-iteration copy so the parallel closure does not
		// observe a later element.
		tc := cases[i]
		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()

			rows, err := Flatten(tc.in, tc.opts...)
			if !tc.wantErr {
				require.NoError(t, err)
			} else {
				require.Error(t, err)
			}

			// Checked in both branches: the error case expects no rows.
			require.ElementsMatch(t, tc.out, rows)
		})
	}
}