Working data structure, optimal seed

This commit is contained in:
Ian Gulliver
2023-12-30 20:14:01 -07:00
parent ac2dfbb5d1
commit 780a9b599b
6 changed files with 140 additions and 132 deletions

View File

@@ -12,7 +12,7 @@ import (
// TestSimple encodes one short fixed message with the chat seed and logs the
// original and encoded byte lengths. It makes no assertions, so it only fails
// if Encode panics.
// NOTE(review): this text is a rendered diff without +/- markers; the
// ChatHeap line appears to be the removed version and the ChatState line its
// replacement — confirm against the repository.
func TestSimple(t *testing.T) {
msg := []byte("this is a test. this is only a test.")
encoded := coding.Encode(seeds.ChatHeap(), msg)
encoded := coding.Encode(seeds.ChatState(), msg)
t.Logf("orig=%d encoded=%d", len(msg), len(encoded))
}
@@ -27,7 +27,7 @@ func TestSMS(t *testing.T) {
for s.Scan() {
msg := s.Bytes()
e := coding.Encode(seeds.ChatHeap(), msg)
e := coding.Encode(seeds.ChatState(), msg)
orig += len(msg)
encoded += len(e)
}

View File

@@ -6,15 +6,15 @@ import (
"github.com/icza/bitio"
"github.com/samber/lo"
"github.com/securemesh/coding/codes"
"github.com/securemesh/coding/heap"
"github.com/securemesh/coding/state"
)
func Encode(h *heap.Heap, msg []byte) []byte {
func Encode(st *state.State, msg []byte) []byte {
buf := &bytes.Buffer{}
w := bitio.NewWriter(buf)
for _, b := range msg {
index := h.IncrementSymbol(b)
index := st.IncrementSymbol(b)
code := codes.CodeForIndex(index)
lo.Must0(w.WriteBits(uint64(code.Value), uint8(code.Bits)))
}

View File

@@ -7,7 +7,7 @@ import (
"github.com/samber/lo"
"github.com/securemesh/coding"
"github.com/securemesh/coding/heap"
"github.com/securemesh/coding/state"
"github.com/securemesh/coding/seeds"
)
@@ -18,56 +18,66 @@ func main() {
return len(sample)
}))
def := heap.NewHeap()
def := state.NewState()
log.Printf("def=%d [%s]", totalLength(def, samples), def)
chat := seeds.ChatHeap()
chat := seeds.ChatState()
log.Printf("chat=%d [%s]", totalLength(chat, samples), chat)
opt := optimize(heap.NewHeap(), samples)
chatOpt := optimize(chat, samples)
if chatOpt == nil {
log.Printf("\toptimal from further additions")
} else {
log.Printf("\tnot optimal [%s]", chatOpt)
}
opt := optimize(state.NewState(), samples)
log.Printf("opt=%d [%s]", totalLength(opt, samples), opt)
}
func optimize(h *heap.Heap, samples [][]byte) *heap.Heap {
func optimize(st *state.State, samples [][]byte) *state.State {
var best *state.State
for true {
better := optimize2(h, samples)
better := optimize2(st, samples)
if better == nil {
return h
return best
}
h = better
log.Printf("\titer=%d [%s]", totalLength(h, samples), h)
best = better
st = better
log.Printf("\titer=%d [%s]", totalLength(st, samples), st)
}
return h
return st
}
// sampleResult is what each optimize2 worker goroutine sends back: one
// candidate and the totalLength score computed for it.
// NOTE(review): rendered diff without +/- markers; the heap field appears to
// be the removed version of the state field.
type sampleResult struct {
heap *heap.Heap
state *state.State
// score is totalLength(candidate, samples); lower is better.
score int
}
func optimize2(baseHeap *heap.Heap, samples [][]byte) *heap.Heap {
func optimize2(baseState *state.State, samples [][]byte) *state.State {
ch := make(chan sampleResult, 100)
for i := 0; i < 256; i++ {
s := byte(i)
go func () {
h := baseHeap.Clone()
h.IncrementSymbol(s)
st := baseState.Clone()
st.IncrementSymbol(s)
ch <- sampleResult{
heap: h,
score: totalLength(h, samples),
state: st,
score: totalLength(st, samples),
}
}()
}
var best *heap.Heap = nil
bestScore := totalLength(baseHeap, samples)
var best *state.State = nil
bestScore := totalLength(baseState, samples)
for i := 0; i < 256; i++ {
res := <-ch
if res.score < bestScore {
best = res.heap
best = res.state
bestScore = res.score
}
}
@@ -75,9 +85,9 @@ func optimize2(baseHeap *heap.Heap, samples [][]byte) *heap.Heap {
return best
}
func totalLength(heap *heap.Heap, samples [][]byte) int {
func totalLength(st *state.State, samples [][]byte) int {
return lo.SumBy(samples, func(sample []byte) int {
return len(coding.Encode(heap.Clone(), sample))
return len(coding.Encode(st.Clone(), sample))
})
}

View File

@@ -1,86 +0,0 @@
package heap
import (
"fmt"
"maps"
"slices"
"strings"
)
// node pairs a byte symbol with the number of times it has been incremented.
type node struct {
	symbol byte
	count  int
}

// Heap keeps all 256 byte symbols in an array laid out as a binary heap
// (the parent of slot i is slot (i-1)/2). A node is moved above its parent
// when its count is higher, or when counts are equal and its symbol value is
// lower. bySymbol maps each symbol to its current slot.
type Heap struct {
	nodes    [256]node
	bySymbol map[byte]int
}

// NewHeap returns a Heap in which every symbol has count zero and symbol i
// occupies slot i.
func NewHeap() *Heap {
	h := &Heap{bySymbol: map[byte]int{}}
	for slot := range h.nodes {
		h.nodes[slot].symbol = byte(slot)
		h.bySymbol[byte(slot)] = slot
	}
	return h
}

// Clone returns an independent copy of h; changes to the copy do not affect
// the original.
func (h Heap) Clone() *Heap {
	dup := new(Heap)
	dup.nodes = h.nodes // array assignment copies every node
	dup.bySymbol = maps.Clone(h.bySymbol)
	return dup
}

// IncrementSymbol adds one to symbol's count, sifts its node up toward the
// root while it outranks its parent, and returns the slot the symbol occupied
// before the call.
func (h *Heap) IncrementSymbol(symbol byte) int {
	start := h.bySymbol[symbol]
	h.nodes[start].count++

	for cur := start; cur != 0; {
		up := h.parentIndex(cur)
		child, parent := h.nodes[cur], h.nodes[up]

		// The parent keeps its place when it has the higher count, or on a
		// tie when it has the lower symbol value.
		if child.count < parent.count {
			break
		}
		if child.count == parent.count && child.symbol > parent.symbol {
			break
		}

		h.nodes[cur], h.nodes[up] = parent, child
		h.bySymbol[parent.symbol] = cur
		h.bySymbol[child.symbol] = up
		cur = up
	}
	return start
}

// String lists every symbol with a non-zero count as "{symbol}=count",
// ordered by ascending count and then ascending symbol, joined with ", ".
func (h Heap) String() string {
	var used []node
	for i := range h.nodes {
		if n := h.nodes[i]; n.count != 0 {
			used = append(used, n)
		}
	}

	slices.SortStableFunc(used, func(a, b node) int {
		if a.count != b.count {
			return a.count - b.count
		}
		return int(a.symbol) - int(b.symbol)
	})

	parts := make([]string, 0, len(used))
	for _, n := range used {
		parts = append(parts, fmt.Sprintf("{%#U}=%d", n.symbol, n.count))
	}
	return strings.Join(parts, ", ")
}

// parentIndex returns the slot of nodeIndex's parent in the binary-heap
// layout.
func (h Heap) parentIndex(nodeIndex int) int {
	return (nodeIndex - 1) / 2
}

View File

@@ -1,41 +1,39 @@
package seeds
import (
"github.com/securemesh/coding/heap"
"github.com/securemesh/coding/state"
)
// Seed table for the chat preset. newStateFromSeed / newHeapFromSeed
// increments every byte of row i once for each pass j <= i, so the symbols
// listed in the row labelled /* NN */ end up with a count of NN. Empty rows
// are placeholders that keep later rows at the intended count.
// NOTE(review): rendered diff without +/- markers; the rows directly under
// "var chatHeap" appear to be the removed table and the rows under
// "var chatState" its replacement, with old and new rows interleaved further
// down — confirm against the repository.
var chatHeap = newHeapFromSeed([][]byte{
/* 01 */ []byte("\x07'(,-8?ACDFHJLMNPRSTUWYbcfgjkpxzê"),
/* 02 */ []byte("\n.dvw"),
/* 03 */ []byte("Ihlmor"),
/* 04 */ []byte("nu"),
/* 05 */ []byte("ey"),
/* 06 */ []byte("i"),
/* 07 */ []byte("s"),
/* 08 */ []byte(""),
var chatState = newStateFromSeed([][]byte{
/* 01 */ []byte("',.0:?CIbgjkpvxz\xea"),
/* 02 */ []byte("\nfw"),
/* 03 */ []byte("cdmuy"),
/* 04 */ []byte("l"),
/* 05 */ []byte("r"),
/* 06 */ []byte("t"),
/* 07 */ []byte("ahos"),
/* 08 */ []byte("in"),
/* 09 */ []byte(""),
/* 10 */ []byte(""),
/* 11 */ []byte("at"),
/* 11 */ []byte(" "),
/* 12 */ []byte(""),
/* 13 */ []byte(""),
/* 14 */ []byte(""),
/* 15 */ []byte(" "),
/* 13 */ []byte("e"),
})
// ChatState (previously ChatHeap) returns a fresh Clone of the package-level
// chat seed, so callers can mutate the result (for example by encoding with
// it) without altering the shared seed.
// NOTE(review): rendered diff without +/- markers; the ChatHeap lines appear
// to be the removed versions of the ChatState lines.
func ChatHeap() *heap.Heap {
return chatHeap.Clone()
func ChatState() *state.State {
return chatState.Clone()
}
// newStateFromSeed (previously newHeapFromSeed) builds a seeded structure from
// a table of byte rows. Pass i replays rows i..end, incrementing every byte in
// them once, so a byte listed in row k (0-based) is incremented k+1 times in
// total.
// NOTE(review): rendered diff without +/- markers; the h/heap lines appear to
// be the removed versions of the st/state lines.
func newHeapFromSeed(seed [][]byte) *heap.Heap {
h := heap.NewHeap()
func newStateFromSeed(seed [][]byte) *state.State {
st := state.NewState()
for i := range seed {
// seed[i:] drops one more leading row each pass, so later rows are
// replayed more often than earlier ones.
for _, s := range seed[i:] {
for _, b := range s {
h.IncrementSymbol(b)
st.IncrementSymbol(b)
}
}
}
return h
return st
}

86
state/state.go Normal file
View File

@@ -0,0 +1,86 @@
package state
import (
"fmt"
"maps"
"strings"
)
// node pairs a byte symbol with the number of times it has been incremented.
type node struct {
	symbol byte
	count  int
}

// State ranks all 256 byte symbols. nodes is kept ordered by descending count,
// with ties broken by ascending symbol value; bySymbol maps each symbol to its
// current position in nodes.
//
// A State must be obtained from NewState or Clone; the zero value holds no
// symbols and IncrementSymbol panics on it.
type State struct {
	nodes    []*node
	bySymbol map[byte]int
}

// NewState returns a State in which every symbol has count zero and symbol i
// sits at index i.
func NewState() *State {
	const symbols = 256

	// One backing array for all nodes instead of one allocation per node.
	backing := make([]node, symbols)
	st := &State{
		nodes:    make([]*node, symbols),
		bySymbol: make(map[byte]int, symbols),
	}
	for i := range backing {
		backing[i].symbol = byte(i)
		st.nodes[i] = &backing[i]
		st.bySymbol[byte(i)] = i
	}
	return st
}

// Clone returns an independent deep copy of st: incrementing symbols on the
// copy never affects the original, and vice versa.
func (st State) Clone() *State {
	// Clone is called for every sample of every optimizer candidate, so the
	// nodes are copied into a single backing array rather than allocated one
	// by one.
	backing := make([]node, len(st.nodes))
	nodes := make([]*node, len(st.nodes))
	for i, n := range st.nodes {
		backing[i] = *n
		nodes[i] = &backing[i]
	}
	return &State{
		nodes:    nodes,
		bySymbol: maps.Clone(st.bySymbol),
	}
}

// IncrementSymbol adds one to symbol's count, moves its node toward the front
// of the ranking until the ordering (descending count, then ascending symbol)
// holds again, and returns the index the symbol occupied before the call.
func (st *State) IncrementSymbol(symbol byte) int {
	oldIndex := st.bySymbol[symbol]
	moved := st.nodes[oldIndex]
	moved.count++

	// Shift every node that the incremented one now outranks back by one
	// position, then drop the incremented node into the gap. This ends in the
	// same arrangement as swapping neighbours pairwise, with fewer writes.
	pos := oldIndex
	for ; pos > 0; pos-- {
		prev := st.nodes[pos-1]
		if prev.count > moved.count {
			break
		}
		if prev.count == moved.count && prev.symbol < moved.symbol {
			break
		}
		st.nodes[pos] = prev
		st.bySymbol[prev.symbol] = pos
	}
	st.nodes[pos] = moved
	st.bySymbol[moved.symbol] = pos

	return oldIndex
}

// String lists every symbol with a non-zero count as "{symbol}=count" in
// ranking order (highest count first), joined with ", ".
func (st State) String() string {
	var parts []string
	for _, n := range st.nodes {
		// nodes is ordered by descending count, so the first zero count
		// means every remaining node is zero as well.
		if n.count == 0 {
			break
		}
		parts = append(parts, fmt.Sprintf("{%#U}=%d", n.symbol, n.count))
	}
	return strings.Join(parts, ", ")
}