// Package dataflatten contains tools to flatten complex data
// structures.
//
// On macOS, many plists use an array of maps, these can be tricky to
// filter. This package knows how to flatten that structure, as well
// as rewriting it as a nested array, or filtering it. It is akin to
// xpath, though simpler.
//
// This tool works primarily through string interfaces, so type
// information may be lost.
//
// # Query Syntax
//
// The query syntax handles both filtering and basic rewriting. It is
// not perfect. The idea behind it is that we descend through a data
// structure, specifying what matches at each level.
//
// Each level of query can do:
//   - specify a filter, this is a simple string match with wildcard support. (prefix and/or postfix, but not infix)
//   - If the data is an array, specify an index
//   - For array-of-maps, specify a key to rewrite as a nested map
//
// Each query term has 3 parts: [#]string[=>kvmatch]
//
//  1. An optional `#`. This denotes a key to rewrite an array-of-maps with
//
//  2. A search term. If this is an integer, it is interpreted as an array index.
//
//  3. A key/value match string. For a map, this is to match the value of a key.
//
// Some examples:
//   - data/users            Return everything under { data: { users: { ... } } }
//   - data/users/0          Return the first item in the users array
//   - data/users/name=>A*   Return users whose name starts with "A"
//   - data/users/#id        Return the users, and rewrite the users array to be a map with the id as the key
//
// See the test suite for extensive examples.
// based on github.com/kolide/launcher/pkg/osquery/tables
package dataflatten

import (
	"bytes"
	"encoding/base64"
	"fmt"
	"strconv"
	"strings"
	"time"
	"unicode/utf8"

	"github.com/micromdm/plist"
	"github.com/rs/zerolog"
	howett "howett.net/plist"
)

// Flattener flattens complex, nested, data
// structures. It recurses through them, and returns a simplified
// form.
At the simplest level, this rewrites: // // { foo: { bar: { baz: 1 } } } // // To: // // [ { path: foo/bar/baz, value: 1 } ] // // It can optionally filtering and rewriting. type Flattener struct { debugLogging bool expandNestedPlist bool includeNestedRaw bool includeNils bool logger zerolog.Logger query []string queryKeyDenoter string queryWildcard string rows []Row } type FlattenOpts func(*Flattener) // IncludeNulls indicates that Flatten should return null values, // instead of skipping over them. func IncludeNulls() FlattenOpts { return func(fl *Flattener) { fl.includeNils = true } } // WithNestedPlist indicates that nested plists should be expanded func WithNestedPlist() FlattenOpts { return func(fl *Flattener) { fl.expandNestedPlist = true } } // WithLogger sets the logger to use func WithLogger(logger zerolog.Logger) FlattenOpts { return func(fl *Flattener) { fl.logger = logger } } // WithDebugLogging enables debug logging. With debug logs, // dataflatten is very verbose. This can overwhelm the other launcher // logs. As we're not generally debugging this library, the default is // to not enable debug logging. func WithDebugLogging() FlattenOpts { return func(fl *Flattener) { fl.debugLogging = true } } // WithQuery Specifies a query to flatten with. This is used both for // re-writing arrays into maps, and for filtering. See "Query // Specification" for docs. func WithQuery(q []string) FlattenOpts { if len(q) == 0 || (len(q) == 1 && q[0] == "") { return func(_ *Flattener) {} } return func(fl *Flattener) { fl.query = q } } // Flatten is the entry point to the Flattener functionality. 
func Flatten(data interface{}, opts ...FlattenOpts) ([]Row, error) { fl := &Flattener{ rows: []Row{}, logger: zerolog.Nop(), queryWildcard: `*`, queryKeyDenoter: `#`, } for _, opt := range opts { opt(fl) } if !fl.debugLogging { fl.logger = fl.logger.Level(zerolog.InfoLevel) } if err := fl.descend([]string{}, data, 0); err != nil { return nil, err } return fl.rows, nil } // descend recurses through a given data structure flattening along the way. func (fl *Flattener) descend(path []string, data interface{}, depth int) error { queryTerm, isQueryMatched := fl.queryAtDepth(depth) logger := fl.logger.With(). Str("caller", "descend"). Int("depth", depth). Int("rows-so-far", len(fl.rows)). Str("query", queryTerm). Str("path", strings.Join(path, "/")). Logger() switch v := data.(type) { case []interface{}: for i, e := range v { pathKey := strconv.Itoa(i) logger.Debug().Str("indexStr", pathKey).Msg("checking an array") // If the queryTerm starts with // queryKeyDenoter, then we want to rewrite // the path based on it. Note that this does // no sanity checking. Multiple values will // re-write. If the value isn't there, you get // nothing. Etc. // // keyName == "name" // keyValue == "alex" (need to test this againsty queryTerm // pathKey == What we descend with if strings.HasPrefix(queryTerm, fl.queryKeyDenoter) { keyQuery := strings.SplitN(strings.TrimPrefix(queryTerm, fl.queryKeyDenoter), "=>", 2) keyName := keyQuery[0] innerlogger := logger.With().Str("arraykeyname", keyName).Logger() logger.Debug().Msg("attempting to coerce array into map") e, ok := e.(map[string]interface{}) if !ok { innerlogger.Debug().Msg("can't coerce into map") continue } // Is keyName in this array? val, ok := e[keyName] if !ok { innerlogger.Debug().Msg("keyName not in map") continue } pathKey, ok = val.(string) if !ok { innerlogger.Debug().Msg("can't coerce pathKey val into string") continue } // Looks good to descend. we're overwritten both e and pathKey. Exit this conditional. 
} if !(isQueryMatched || fl.queryMatchArrayElement(e, i, queryTerm)) { logger.Debug().Msg("query not matched") continue } if err := fl.descend(append(path, pathKey), e, depth+1); err != nil { return fmt.Errorf("flattening array: %w", err) } } case map[string]interface{}: logger.Debug().Msg("checking a map") for k, e := range v { // Check that the key name matches. If not, skip this entire // branch of the map if !(isQueryMatched || fl.queryMatchString(k, queryTerm)) { continue } if err := fl.descend(append(path, k), e, depth+1); err != nil { return fmt.Errorf("flattening map: %w", err) } } case []map[string]interface{}: logger.Debug().Msg("checking an array of maps") for i, e := range v { if err := fl.descend(append(path, strconv.Itoa(i)), e, depth+1); err != nil { return fmt.Errorf("flattening array of maps: %w", err) } } case nil: // Because we want to filter nils out, we do _not_ examine isQueryMatched here if !(fl.queryMatchNil(queryTerm)) { logger.Debug().Msg("query not matched") return nil } fl.rows = append(fl.rows, NewRow(path, "")) case string: return fl.descendMaybePlist(path, []byte(v), depth) case []byte: // Most string like data comes in this way return fl.descendMaybePlist(path, v, depth) default: if err := fl.handleStringLike(logger, path, v, depth); err != nil { return fmt.Errorf("flattening at path %v: %w", path, err) } } return nil } // handleStringLike is called when we finally have an object we think // can be converted to a string. 
It uses the depth to compare against // the query, and returns a stringify'ed value func (fl *Flattener) handleStringLike(logger zerolog.Logger, path []string, v interface{}, depth int) error { queryTerm, isQueryMatched := fl.queryAtDepth(depth) stringValue, err := stringify(v) if err != nil { return err } if !(isQueryMatched || fl.queryMatchString(stringValue, queryTerm)) { logger.Debug().Msg("query not matched") return nil } fl.rows = append(fl.rows, NewRow(path, stringValue)) return nil } // descendMaybePlist optionally tries to decode []byte data as an // embedded plist. In the case of failures, it falls back to treating // it like a plain string. func (fl *Flattener) descendMaybePlist(path []string, data []byte, depth int) error { logger := fl.logger.With(). Str("caller", "descendMaybePlist"). Int("depth", depth). Int("rows-so-far", len(fl.rows)). Str("path", strings.Join(path, "/")). Logger() // Skip if we're not expanding nested plists if !fl.expandNestedPlist { return fl.handleStringLike(logger, path, data, depth) } // Skip if this doesn't look like a plist. if !isPlist(data) { return fl.handleStringLike(logger, path, data, depth) } // Looks like a plist. Try parsing it logger.Debug().Msg("Parsing inner plist") var innerData interface{} if err := plist.Unmarshal(data, &innerData); err != nil { logger.Info().Err(err).Msg("plist parsing failed") return fl.handleStringLike(logger, path, data, depth) } // have a parsed plist. Descend and return from here. if fl.includeNestedRaw { if err := fl.handleStringLike(logger, append(path, "_raw"), data, depth); err != nil { logger.Error().Err(err).Msg("Failed to add _raw key") } } if err := fl.descend(path, innerData, depth); err != nil { return fmt.Errorf("flattening plist data: %w", err) } return nil } func (fl *Flattener) queryMatchNil(queryTerm string) bool { // TODO: If needed, we could use queryTerm for optional nil filtering return fl.includeNils } // queryMatchArrayElement matches arrays. This one is magic. 
// // Syntax: // // #i -- Match index i. For example `#0` // k=>queryTerm -- If this is a map, it should have key k, that matches queryTerm // // We use `=>` as something that is reasonably intuitive, and not very // likely to occur on it's own. Unfortunately, `==` shows up in base64 func (fl *Flattener) queryMatchArrayElement(data interface{}, arrIndex int, queryTerm string) bool { logger := fl.logger.With(). Str("caller", "queryMatchArrayElement"). Int("rows-so-far", len(fl.rows)). Str("query", queryTerm). Int("arrIndex", arrIndex). Logger() // strip off the key re-write denotation before trying to match queryTerm = strings.TrimPrefix(queryTerm, fl.queryKeyDenoter) if queryTerm == fl.queryWildcard { return true } // If the queryTerm is an int, then we expect to match the index if queryIndex, err := strconv.Atoi(queryTerm); err == nil { logger.Debug().Msg("using numeric index comparison") return queryIndex == arrIndex } logger.Debug().Msg("checking data type") switch dataCasted := data.(type) { case []interface{}: // fails. We can't match an array that has arrays as elements. Use a wildcard return false case map[string]interface{}: kvQuery := strings.SplitN(queryTerm, "=>", 2) // If this is one long, then we're testing for whether or not there's a key with this name, if len(kvQuery) == 1 { _, ok := dataCasted[kvQuery[0]] return ok } // Else see if the value matches for k, v := range dataCasted { // Since this needs to check against _every_ // member, return true. Or fall through to the // false. if fl.queryMatchString(k, kvQuery[0]) && fl.queryMatchStringify(v, kvQuery[1]) { return true } } return false default: // non-iterable. 
stringify and be done return fl.queryMatchStringify(dataCasted, queryTerm) } } func (fl *Flattener) queryMatchStringify(data interface{}, queryTerm string) bool { // strip off the key re-write denotation before trying to match queryTerm = strings.TrimPrefix(queryTerm, fl.queryKeyDenoter) if queryTerm == fl.queryWildcard { return true } if data == nil { return fl.queryMatchNil(queryTerm) } stringValue, err := stringify(data) if err != nil { return false } return fl.queryMatchString(stringValue, queryTerm) } func (fl *Flattener) queryMatchString(v, queryTerm string) bool { if queryTerm == fl.queryWildcard { return true } // Some basic string manipulations to handle prefix and suffix operations switch { case strings.HasPrefix(queryTerm, fl.queryWildcard) && strings.HasSuffix(queryTerm, fl.queryWildcard): queryTerm = strings.TrimPrefix(queryTerm, fl.queryWildcard) queryTerm = strings.TrimSuffix(queryTerm, fl.queryWildcard) return strings.Contains(v, queryTerm) case strings.HasPrefix(queryTerm, fl.queryWildcard): queryTerm = strings.TrimPrefix(queryTerm, fl.queryWildcard) return strings.HasSuffix(v, queryTerm) case strings.HasSuffix(queryTerm, fl.queryWildcard): queryTerm = strings.TrimSuffix(queryTerm, fl.queryWildcard) return strings.HasPrefix(v, queryTerm) } return v == queryTerm } // queryAtDepth returns the query parameter for a given depth, and // boolean indicating we've run out of queries. If we've run out of // queries, than we can start checking, everything is a match. func (fl *Flattener) queryAtDepth(depth int) (string, bool) { // if we're nil, there's an implied wildcard // // This works because: // []string is len 0, and nil // []string{} is len 0, but not nil if fl.query == nil { return fl.queryWildcard, true } // If there's no query for this depth, then there's an implied // wildcard. This allows the query to specify prefixes. 
if depth+1 > len(fl.query) { return fl.queryWildcard, true } q := fl.query[depth] return q, q == fl.queryWildcard } // stringify takes an arbitrary piece of data, and attempst to coerce // it into a string. func stringify(data interface{}) (string, error) { switch v := data.(type) { case nil: return "", nil case string: return v, nil case []byte: s := string(v) if utf8.ValidString(s) { return s, nil } return base64.StdEncoding.EncodeToString(v), nil case uint8: return strconv.FormatUint(uint64(v), 10), nil case uint16: return strconv.FormatUint(uint64(v), 10), nil case uint32: return strconv.FormatUint(uint64(v), 10), nil case uint64: return strconv.FormatUint(v, 10), nil case float32: return strconv.FormatFloat(float64(v), 'f', -1, 32), nil case float64: return strconv.FormatFloat(v, 'f', -1, 64), nil case int: return strconv.Itoa(v), nil case int8: return strconv.FormatInt(int64(v), 10), nil case int16: return strconv.FormatInt(int64(v), 10), nil case int32: return strconv.FormatInt(int64(v), 10), nil case int64: return strconv.FormatInt(v, 10), nil case bool: return strconv.FormatBool(v), nil case time.Time: return strconv.FormatInt(v.Unix(), 10), nil case howett.UID: return strconv.FormatUint(uint64(v), 10), nil case fmt.Stringer: return v.String(), nil default: // spew.Dump(data) return "", fmt.Errorf("unknown type on %v", data) } } // isPlist returns whether or not something looks like it might be a // plist. It uses Contains, instead of HasPrefix, as some encodings // have a leading character. func isPlist(data []byte) bool { var dataPrefix []byte if len(data) <= 30 { dataPrefix = data } else { dataPrefix = data[0:30] } if bytes.Contains(dataPrefix, []byte("bplist0")) { return true } if bytes.Contains(dataPrefix, []byte(`xml version="1.0"`)) && bytes.Contains(data, []byte(`