2023-12-29 20:48:12 -07:00
|
|
|
package main
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"bufio"
|
2023-12-31 16:09:35 -08:00
|
|
|
"bytes"
|
2023-12-29 20:48:12 -07:00
|
|
|
"log"
|
|
|
|
|
"os"
|
2023-12-30 20:37:18 -08:00
|
|
|
"slices"
|
2023-12-29 20:48:12 -07:00
|
|
|
|
|
|
|
|
"github.com/samber/lo"
|
|
|
|
|
"github.com/securemesh/coding"
|
|
|
|
|
"github.com/securemesh/coding/seeds"
|
2023-12-31 16:09:35 -08:00
|
|
|
"github.com/securemesh/coding/state"
|
2023-12-29 20:48:12 -07:00
|
|
|
)
|
|
|
|
|
|
|
|
|
|
func main() {
|
|
|
|
|
samples := lo.Must(loadSamples())
|
|
|
|
|
|
|
|
|
|
log.Printf("orig=%d", lo.SumBy(samples, func(sample []byte) int {
|
|
|
|
|
return len(sample)
|
|
|
|
|
}))
|
|
|
|
|
|
2023-12-31 19:24:17 -08:00
|
|
|
dict := buildDictionary(samples, 1024)
|
|
|
|
|
|
2023-12-30 20:14:01 -07:00
|
|
|
def := state.NewState()
|
2023-12-30 15:51:29 -07:00
|
|
|
log.Printf("def=%d [%s]", totalLength(def, samples), def)
|
|
|
|
|
|
2023-12-30 20:14:01 -07:00
|
|
|
chat := seeds.ChatState()
|
2023-12-30 15:51:29 -07:00
|
|
|
log.Printf("chat=%d [%s]", totalLength(chat, samples), chat)
|
2023-12-29 20:48:12 -07:00
|
|
|
|
2023-12-30 20:14:01 -07:00
|
|
|
opt := optimize(state.NewState(), samples)
|
2023-12-29 20:48:12 -07:00
|
|
|
log.Printf("opt=%d [%s]", totalLength(opt, samples), opt)
|
|
|
|
|
}
|
|
|
|
|
|
2023-12-31 19:24:17 -08:00
|
|
|
// pair is one candidate dictionary entry: a byte sequence and the number of
// times it has been observed.
type pair struct {
	// symbol is the byte sequence being counted.
	symbol []byte

	// count is the number of occurrences recorded for symbol.
	count int
}
|
|
|
|
|
|
|
|
|
|
func buildDictionary(samples [][]byte, num int) [][]byte {
|
|
|
|
|
counts := map[uint64]*pair{}
|
|
|
|
|
|
|
|
|
|
for _, sample := range samples {
|
|
|
|
|
for i := 0; i < len(sample); i++ {
|
|
|
|
|
sub := sample[i:]
|
|
|
|
|
for j := 2; j < min(5, len(sub)); j++ {
|
|
|
|
|
sub2 := sub[:j]
|
|
|
|
|
k := toUint64(sub2)
|
|
|
|
|
|
|
|
|
|
p := counts[k]
|
|
|
|
|
if p == nil {
|
|
|
|
|
counts[k] = &pair{
|
|
|
|
|
symbol: sub2,
|
|
|
|
|
count: 1,
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
p.count++
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pairs := []*pair{}
|
|
|
|
|
|
|
|
|
|
for _, p := range counts {
|
|
|
|
|
pairs = append(pairs, p)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
slices.SortFunc(pairs, func(a, b *pair) int { return bytes.Compare(a.symbol, b.symbol) })
|
|
|
|
|
slices.SortStableFunc(pairs, func(a, b *pair) int { return b.score() - a.score() })
|
|
|
|
|
|
|
|
|
|
ret := [][]byte{}
|
|
|
|
|
|
|
|
|
|
for i := 0; i < num && i < len(pairs); i++ {
|
|
|
|
|
ret = append(ret, pairs[i].symbol)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return ret
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// toUint64 packs bs into a uint64 in big-endian order: the first byte ends up
// in the most significant occupied position and the last byte in the lowest
// eight bits.
//
// Leading zero bytes do not change the result, and when bs is longer than
// eight bytes only the final eight survive, because earlier bytes are shifted
// out of the 64-bit accumulator. A nil or empty slice yields 0.
func toUint64(bs []byte) uint64 {
	acc := uint64(0)

	for i := 0; i < len(bs); i++ {
		acc <<= 8
		acc |= uint64(bs[i])
	}

	return acc
}
|
|
|
|
|
|
|
|
|
|
func (p pair) score() int {
|
|
|
|
|
return p.count * ((len(p.symbol) * 8) - 11)
|
|
|
|
|
}
|
|
|
|
|
|
2023-12-30 20:14:01 -07:00
|
|
|
func optimize(st *state.State, samples [][]byte) *state.State {
|
2023-12-31 16:09:35 -08:00
|
|
|
st.AddSymbol([]byte("it "))
|
|
|
|
|
|
2023-12-29 20:48:12 -07:00
|
|
|
for true {
|
2023-12-30 20:14:01 -07:00
|
|
|
better := optimize2(st, samples)
|
2023-12-29 20:48:12 -07:00
|
|
|
if better == nil {
|
2023-12-30 21:00:37 -07:00
|
|
|
return st
|
2023-12-29 20:48:12 -07:00
|
|
|
}
|
2023-12-30 20:14:01 -07:00
|
|
|
st = better
|
|
|
|
|
log.Printf("\titer=%d [%s]", totalLength(st, samples), st)
|
2023-12-29 20:48:12 -07:00
|
|
|
}
|
|
|
|
|
|
2023-12-30 20:14:01 -07:00
|
|
|
return st
|
2023-12-29 20:48:12 -07:00
|
|
|
}
|
|
|
|
|
|
2023-12-29 22:02:07 -07:00
|
|
|
type sampleResult struct {
|
2023-12-31 16:09:35 -08:00
|
|
|
symbol []byte
|
|
|
|
|
state *state.State
|
|
|
|
|
score int
|
2023-12-29 22:02:07 -07:00
|
|
|
}
|
|
|
|
|
|
2023-12-30 20:14:01 -07:00
|
|
|
func optimize2(baseState *state.State, samples [][]byte) *state.State {
|
2023-12-29 22:02:07 -07:00
|
|
|
ch := make(chan sampleResult, 100)
|
2023-12-31 16:09:35 -08:00
|
|
|
symbols := baseState.Symbols()
|
2023-12-29 22:02:07 -07:00
|
|
|
|
2023-12-31 16:09:35 -08:00
|
|
|
for _, symbol := range symbols {
|
2023-12-30 20:37:18 -08:00
|
|
|
res := sampleResult{
|
2023-12-31 16:09:35 -08:00
|
|
|
symbol: symbol,
|
2023-12-30 20:37:18 -08:00
|
|
|
}
|
|
|
|
|
|
2023-12-31 16:09:35 -08:00
|
|
|
go func() {
|
2023-12-30 20:14:01 -07:00
|
|
|
st := baseState.Clone()
|
2023-12-30 20:37:18 -08:00
|
|
|
st.IncrementSymbol(res.symbol)
|
|
|
|
|
res.state = st
|
|
|
|
|
res.score = totalLength(st, samples)
|
|
|
|
|
ch <- res
|
2023-12-29 22:02:07 -07:00
|
|
|
}()
|
|
|
|
|
}
|
|
|
|
|
|
2023-12-30 20:37:18 -08:00
|
|
|
results := []sampleResult{}
|
2023-12-29 20:48:12 -07:00
|
|
|
|
2023-12-31 16:09:35 -08:00
|
|
|
for _ = range symbols {
|
2023-12-30 20:37:18 -08:00
|
|
|
results = append(results, <-ch)
|
|
|
|
|
}
|
|
|
|
|
|
2023-12-31 16:09:35 -08:00
|
|
|
slices.SortFunc(results, func(a, b sampleResult) int { return bytes.Compare(a.symbol, b.symbol) })
|
2023-12-30 20:37:18 -08:00
|
|
|
best := slices.MaxFunc(results, func(a, b sampleResult) int { return b.score - a.score })
|
|
|
|
|
|
|
|
|
|
if best.score == totalLength(baseState, samples) {
|
|
|
|
|
return nil
|
2023-12-29 20:48:12 -07:00
|
|
|
}
|
|
|
|
|
|
2023-12-30 20:37:18 -08:00
|
|
|
return best.state
|
2023-12-29 20:48:12 -07:00
|
|
|
}
|
|
|
|
|
|
2023-12-30 20:14:01 -07:00
|
|
|
func totalLength(st *state.State, samples [][]byte) int {
|
2023-12-29 20:48:12 -07:00
|
|
|
return lo.SumBy(samples, func(sample []byte) int {
|
2023-12-30 20:14:01 -07:00
|
|
|
return len(coding.Encode(st.Clone(), sample))
|
2023-12-29 20:48:12 -07:00
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// loadSamples reads "sms.txt" from the current working directory and returns
// its lines, one sample per line, without line terminators.
//
// The result is non-nil (possibly empty) on success. An error is returned if
// the file cannot be opened or if scanning fails, for example on a read error
// or a line longer than bufio.Scanner's token limit; in that case the returned
// slice is nil.
func loadSamples() ([][]byte, error) {
	fh, err := os.Open("sms.txt")
	if err != nil {
		return nil, err
	}
	defer fh.Close()

	s := bufio.NewScanner(fh)
	ret := [][]byte{}

	for s.Scan() {
		// Scanner.Bytes points into the scanner's internal buffer, which later
		// Scan calls overwrite; copy each line so stored samples stay intact.
		ret = append(ret, bytes.Clone(s.Bytes()))
	}

	// Scan returns false on both EOF and failure; only Err distinguishes them.
	if err := s.Err(); err != nil {
		return nil, err
	}

	return ret, nil
}
|