aboutsummaryrefslogtreecommitdiff
path: root/columnar/columnar.go
blob: 7236901f66f8ccbd0ac886791501780287640d3c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
// SPDX-FileCopyrightText: 2026 Stefan Majewsky <majewsky@gmx.net>
// SPDX-License-Identifier: Apache-2.0

// Package columnar provides efficient encoding of lists of objects in a columnar JSON format.
//
// The standard way of encoding a list of objects in JSON looks like this:
//
//	[
//		{ "id": 1, "first_name": "Alice", "last_name": "Allison", "married": false },
//		{ "id": 2, "first_name": "Bob", "last_name": "Burger", "married": true },
//		{ "id": 3, "first_name": "Carol", "last_name": "Callagher", "married": true }
//	]
//
// Encoding the same list in a columnar fashion results in this:
//
//	{
//		"id": [1, 2, 3],
//		"first_name": ["Alice", "Bob", "Carol"],
//		"last_name": ["Allison", "Burger", "Callagher"],
//		"married": [false, true, true]
//	}
//
// In this example, changing the encoding from row-wise to columnar reduced the
// (minified) size of the JSON encoding from 202 to 124 bytes.
//
// This package eliminates the boilerplate code that would be associated with
// converting a list of objects into the respective columnar form before
// marshaling, and vice versa after unmarshaling.
package columnar

import (
	"encoding/json"
	"fmt"
	"maps"
	"reflect"
	"slices"
	"sync"
)

// NOTE: naming convention for variables
//
// single-letter = original type (t = reflect.Type, v = reflect.Value, f = reflect.StructField)
// with "c" prefix = columnar type (ct = reflect.Type, cv = reflect.Value, cf = reflect.StructField)

// compile-time assertions that *List satisfies both JSON interfaces
var (
	_ json.Marshaler   = (*List[bool])(nil)
	_ json.Unmarshaler = (*List[bool])(nil)
)

// columnarListTypes memoizes the auto-generated columnar struct type,
// keyed by the original struct type it was derived from
var columnarListTypes = make(map[reflect.Type]reflect.Type)

// List provides columnar marshaling for lists of objects.
// T must be a struct type or a pointer to one, otherwise all methods on this type will panic.
//
// Unexported fields of T do not take part in the encoding. Every other field
// becomes one column holding that field's value for each element of the list.
//
// Please refer to the package docstring for how this type is marshaled.
type List[T any] []T

// foreachRelevantField calls action once for every field of the struct type t
// that takes part in the columnar encoding, in declaration order.
//
// A field is skipped when it is unexported, or when its struct tag is exactly
// `json:"-"`. encoding/json always omits fields with that tag, so a column
// for such a field would stay empty when decoding and would then disagree in
// length with every populated column. Note that `json:"-,"` is not skipped:
// it names a regular field whose JSON key is "-".
//
// t must be of kind [reflect.Struct]; t.NumField panics otherwise.
func foreachRelevantField(t reflect.Type, action func(f reflect.StructField)) {
	for idx, count := 0, t.NumField(); idx < count; idx++ {
		f := t.Field(idx)
		if !f.IsExported() {
			continue
		}
		if f.Tag.Get("json") == "-" {
			continue
		}
		action(f)
	}
}

// columnarListTypesMu serializes all access to the columnarListTypes cache.
// The JSON methods of List can run on several goroutines at once, and
// unsynchronized concurrent map writes are a fatal runtime error in Go.
var columnarListTypesMu sync.Mutex

// getColumnarType returns the struct type that represents a list of t in
// columnar form: for every field visited by foreachRelevantField it has a
// field of the same name and struct tag whose type is a slice of the original
// field type.
//
// Generated types are cached per t, so repeated calls return the same type.
// The function is safe for concurrent use.
//
// It panics if t is not a struct type.
func getColumnarType(t reflect.Type) reflect.Type {
	if t.Kind() != reflect.Struct {
		// Print the type itself rather than the %T of a zero value: for an
		// interface type the zero value is a nil interface, which %T renders
		// as the unhelpful "<nil>".
		panic(fmt.Sprintf("type %s is not a struct or pointer to a struct", t.String()))
	}

	columnarListTypesMu.Lock()
	defer columnarListTypesMu.Unlock()

	if ct, ok := columnarListTypes[t]; ok {
		return ct
	}

	var cfields []reflect.StructField
	foreachRelevantField(t, func(f reflect.StructField) {
		cfields = append(cfields, reflect.StructField{
			Name: f.Name,
			Type: reflect.SliceOf(f.Type),
			Tag:  f.Tag, // keep the tag so that the column uses the same JSON key as the field
		})
	})

	ct := reflect.StructOf(cfields)
	columnarListTypes[t] = ct
	return ct
}

// MarshalJSON implements the [json.Marshaler] interface.
//
// The list is transposed into a value of the columnar struct type obtained
// from getColumnarType, which is then handed to [json.Marshal]. Each column
// has exactly len(l) entries.
//
// An error is returned if foreachRelevantField reports no fields for T, or if
// an element of the list is a nil pointer (a missing row cannot be expressed
// in columnar form). Like all methods of List, it panics if T is neither a
// struct type nor a pointer to one.
func (l List[T]) MarshalJSON() ([]byte, error) {
	t := reflect.TypeFor[T]()
	for t.Kind() == reflect.Pointer {
		t = t.Elem()
	}
	ct := getColumnarType(t)
	cv := reflect.New(ct).Elem()

	// Allocate every column once at its final length. The slice stored in cv
	// and the one remembered here share a backing array, so filling the
	// latter below also fills the value that gets marshaled.
	type column struct {
		index  []int         // index path of the source field within T
		values reflect.Value // slice with one entry per list element
	}
	columns := make([]column, 0, t.NumField())
	foreachRelevantField(t, func(f reflect.StructField) {
		values := reflect.MakeSlice(reflect.SliceOf(f.Type), len(l), len(l))
		cv.FieldByName(f.Name).Set(values)
		columns = append(columns, column{index: f.Index, values: values})
	})
	if len(columns) == 0 {
		zero := reflect.New(t).Elem().Interface()
		return nil, fmt.Errorf("%[1]T has no exported fields", zero)
	}

	for idx, elem := range l {
		v := reflect.ValueOf(elem)
		for v.Kind() == reflect.Pointer {
			if v.IsNil() {
				// Dereferencing would yield an invalid Value and make
				// FieldByIndex panic; report the bad row instead.
				return nil, fmt.Errorf("cannot marshal nil pointer at index %d into columnar form", idx)
			}
			v = v.Elem()
		}
		for _, col := range columns {
			col.values.Index(idx).Set(v.FieldByIndex(col.index))
		}
	}

	return json.Marshal(cv.Interface())
}

// UnmarshalJSON implements the [json.Unmarshaler] interface.
//
// The input is decoded into a value of the columnar struct type obtained from
// getColumnarType. All columns must then have the same length, which becomes
// the length of the list; each row is assembled by picking the entry at the
// row's index from every column. When T is a pointer type, a fresh value is
// allocated for every row.
func (l *List[T]) UnmarshalJSON(buf []byte) error {
	t := reflect.TypeFor[T]()
	for t.Kind() == reflect.Pointer {
		t = t.Elem()
	}

	cptr := reflect.New(getColumnarType(t))
	if err := json.Unmarshal(buf, cptr.Interface()); err != nil {
		return err
	}
	cv := cptr.Elem()

	// Remember each decoded column by field name, and tally how many columns
	// have which length.
	byName := make(map[string]reflect.Value, t.NumField())
	tally := make(map[int]int)
	foreachRelevantField(t, func(f reflect.StructField) {
		col := cv.FieldByName(f.Name)
		byName[f.Name] = col
		tally[col.Len()]++
	})

	if len(tally) == 0 {
		zero := reflect.New(t).Elem().Interface()
		return fmt.Errorf("%[1]T has no exported fields", zero)
	}
	if len(tally) > 1 {
		return fmt.Errorf("cannot unmarshal from columns with inconsistent lengths %v", slices.Sorted(maps.Keys(tally)))
	}

	// Exactly one distinct length remains; it is the number of rows.
	rowCount := 0
	for length := range tally {
		rowCount = length
	}
	*l = make(List[T], rowCount)

	rows := *l
	for idx := range rows {
		v := reflect.ValueOf(&rows[idx]).Elem()
		// Allocate through every pointer level until the struct is reached.
		for v.Kind() == reflect.Pointer {
			v.Set(reflect.New(v.Type().Elem()))
			v = v.Elem()
		}
		foreachRelevantField(t, func(f reflect.StructField) {
			cell := byName[f.Name].Index(idx)
			v.FieldByIndex(f.Index).Set(cell)
		})
	}
	return nil
}