diff --git a/codes/codes.go b/codes/codes.go index 5c349c7..6fbe5ac 100644 --- a/codes/codes.go +++ b/codes/codes.go @@ -1,271 +1,26 @@ package codes type Code struct { - Value uint16 - Bits int + Value uint32 + Bits uint8 } -var codes = []Code{ - {Value: 0b0000, Bits: 4}, - {Value: 0b0001, Bits: 4}, - {Value: 0b0010, Bits: 4}, - {Value: 0b0011, Bits: 4}, - {Value: 0b01000, Bits: 5}, - {Value: 0b01001, Bits: 5}, - {Value: 0b01010, Bits: 5}, - {Value: 0b01011, Bits: 5}, - {Value: 0b011000, Bits: 6}, - {Value: 0b011001, Bits: 6}, - {Value: 0b011010, Bits: 6}, - {Value: 0b011011, Bits: 6}, - {Value: 0b011100, Bits: 6}, - {Value: 0b011101, Bits: 6}, - {Value: 0b011110, Bits: 6}, - {Value: 0b011111, Bits: 6}, - {Value: 0b1000000, Bits: 7}, - {Value: 0b1000001, Bits: 7}, - {Value: 0b1000010, Bits: 7}, - {Value: 0b1000011, Bits: 7}, - {Value: 0b1000100, Bits: 7}, - {Value: 0b1000101, Bits: 7}, - {Value: 0b1000110, Bits: 7}, - {Value: 0b1000111, Bits: 7}, - {Value: 0b1001000, Bits: 7}, - {Value: 0b1001001, Bits: 7}, - {Value: 0b1001010, Bits: 7}, - {Value: 0b1001011, Bits: 7}, - {Value: 0b1001100, Bits: 7}, - {Value: 0b1001101, Bits: 7}, - {Value: 0b1001110, Bits: 7}, - {Value: 0b1001111, Bits: 7}, - {Value: 0b10100000, Bits: 8}, - {Value: 0b10100001, Bits: 8}, - {Value: 0b10100010, Bits: 8}, - {Value: 0b10100011, Bits: 8}, - {Value: 0b10100100, Bits: 8}, - {Value: 0b10100101, Bits: 8}, - {Value: 0b10100110, Bits: 8}, - {Value: 0b10100111, Bits: 8}, - {Value: 0b10101000, Bits: 8}, - {Value: 0b10101001, Bits: 8}, - {Value: 0b10101010, Bits: 8}, - {Value: 0b10101011, Bits: 8}, - {Value: 0b10101100, Bits: 8}, - {Value: 0b10101101, Bits: 8}, - {Value: 0b10101110, Bits: 8}, - {Value: 0b10101111, Bits: 8}, - {Value: 0b10110000, Bits: 8}, - {Value: 0b10110001, Bits: 8}, - {Value: 0b10110010, Bits: 8}, - {Value: 0b10110011, Bits: 8}, - {Value: 0b10110100, Bits: 8}, - {Value: 0b10110101, Bits: 8}, - {Value: 0b10110110, Bits: 8}, - {Value: 0b10110111, Bits: 8}, - {Value: 0b10111000, Bits: 8}, - {Value: 0b10111001, Bits: 8}, - {Value: 0b10111010, Bits: 8}, - {Value: 0b10111011, Bits: 8}, - {Value: 0b10111100, Bits: 8}, - {Value: 0b10111101, Bits: 8}, - {Value: 0b10111110, Bits: 8}, - {Value: 0b10111111, Bits: 8}, - {Value: 0b110000000, Bits: 9}, - {Value: 0b110000001, Bits: 9}, - {Value: 0b110000010, Bits: 9}, - {Value: 0b110000011, Bits: 9}, - {Value: 0b110000100, Bits: 9}, - {Value: 0b110000101, Bits: 9}, - {Value: 0b110000110, Bits: 9}, - {Value: 0b110000111, Bits: 9}, - {Value: 0b110001000, Bits: 9}, - {Value: 0b110001001, Bits: 9}, - {Value: 0b110001010, Bits: 9}, - {Value: 0b110001011, Bits: 9}, - {Value: 0b110001100, Bits: 9}, - {Value: 0b110001101, Bits: 9}, - {Value: 0b110001110, Bits: 9}, - {Value: 0b110001111, Bits: 9}, - {Value: 0b110010000, Bits: 9}, - {Value: 0b110010001, Bits: 9}, - {Value: 0b110010010, Bits: 9}, - {Value: 0b110010011, Bits: 9}, - {Value: 0b110010100, Bits: 9}, - {Value: 0b110010101, Bits: 9}, - {Value: 0b110010110, Bits: 9}, - {Value: 0b110010111, Bits: 9}, - {Value: 0b110011000, Bits: 9}, - {Value: 0b110011001, Bits: 9}, - {Value: 0b110011010, Bits: 9}, - {Value: 0b110011011, Bits: 9}, - {Value: 0b110011100, Bits: 9}, - {Value: 0b110011101, Bits: 9}, - {Value: 0b110011110, Bits: 9}, - {Value: 0b110011111, Bits: 9}, - {Value: 0b110100000, Bits: 9}, - {Value: 0b110100001, Bits: 9}, - {Value: 0b110100010, Bits: 9}, - {Value: 0b110100011, Bits: 9}, - {Value: 0b110100100, Bits: 9}, - {Value: 0b110100101, Bits: 9}, - {Value: 0b110100110, Bits: 9}, - {Value: 0b110100111, Bits: 9}, - {Value: 0b110101000, Bits: 9}, - {Value: 0b110101001, Bits: 9}, - {Value: 0b110101010, Bits: 9}, - {Value: 0b110101011, Bits: 9}, - {Value: 0b110101100, Bits: 9}, - {Value: 0b110101101, Bits: 9}, - {Value: 0b110101110, Bits: 9}, - {Value: 0b110101111, Bits: 9}, - {Value: 0b110110000, Bits: 9}, - {Value: 0b110110001, Bits: 9}, - {Value: 0b110110010, Bits: 9}, - {Value: 0b110110011, Bits: 9}, - {Value: 0b110110100, Bits: 9}, - {Value: 0b110110101, Bits: 9}, - {Value: 0b110110110, Bits: 9}, - {Value: 0b110110111, Bits: 9}, - {Value: 0b110111000, Bits: 9}, - {Value: 0b110111001, Bits: 9}, - {Value: 0b110111010, Bits: 9}, - {Value: 0b110111011, Bits: 9}, - {Value: 0b110111100, Bits: 9}, - {Value: 0b110111101, Bits: 9}, - {Value: 0b110111110, Bits: 9}, - {Value: 0b110111111, Bits: 9}, - {Value: 0b1110000000, Bits: 10}, - {Value: 0b1110000001, Bits: 10}, - {Value: 0b1110000010, Bits: 10}, - {Value: 0b1110000011, Bits: 10}, - {Value: 0b1110000100, Bits: 10}, - {Value: 0b1110000101, Bits: 10}, - {Value: 0b1110000110, Bits: 10}, - {Value: 0b1110000111, Bits: 10}, - {Value: 0b1110001000, Bits: 10}, - {Value: 0b1110001001, Bits: 10}, - {Value: 0b1110001010, Bits: 10}, - {Value: 0b1110001011, Bits: 10}, - {Value: 0b1110001100, Bits: 10}, - {Value: 0b1110001101, Bits: 10}, - {Value: 0b1110001110, Bits: 10}, - {Value: 0b1110001111, Bits: 10}, - {Value: 0b1110010000, Bits: 10}, - {Value: 0b1110010001, Bits: 10}, - {Value: 0b1110010010, Bits: 10}, - {Value: 0b1110010011, Bits: 10}, - {Value: 0b1110010100, Bits: 10}, - {Value: 0b1110010101, Bits: 10}, - {Value: 0b1110010110, Bits: 10}, - {Value: 0b1110010111, Bits: 10}, - {Value: 0b1110011000, Bits: 10}, - {Value: 0b1110011001, Bits: 10}, - {Value: 0b1110011010, Bits: 10}, - {Value: 0b1110011011, Bits: 10}, - {Value: 0b1110011100, Bits: 10}, - {Value: 0b1110011101, Bits: 10}, - {Value: 0b1110011110, Bits: 10}, - {Value: 0b1110011111, Bits: 10}, - {Value: 0b1110100000, Bits: 10}, - {Value: 0b1110100001, Bits: 10}, - {Value: 0b1110100010, Bits: 10}, - {Value: 0b1110100011, Bits: 10}, - {Value: 0b1110100100, Bits: 10}, - {Value: 0b1110100101, Bits: 10}, - {Value: 0b1110100110, Bits: 10}, - {Value: 0b1110100111, Bits: 10}, - {Value: 0b1110101000, Bits: 10}, - {Value: 0b1110101001, Bits: 10}, - {Value: 0b1110101010, Bits: 10}, - {Value: 0b1110101011, Bits: 10}, - {Value: 0b1110101100, Bits: 10}, - {Value: 0b1110101101, Bits: 10}, - {Value: 0b1110101110, Bits: 10}, - {Value: 0b1110101111, Bits: 10}, - {Value: 0b1110110000, Bits: 10}, - {Value: 0b1110110001, Bits: 10}, - {Value: 0b1110110010, Bits: 10}, - {Value: 0b1110110011, Bits: 10}, - {Value: 0b1110110100, Bits: 10}, - {Value: 0b1110110101, Bits: 10}, - {Value: 0b1110110110, Bits: 10}, - {Value: 0b1110110111, Bits: 10}, - {Value: 0b1110111000, Bits: 10}, - {Value: 0b1110111001, Bits: 10}, - {Value: 0b1110111010, Bits: 10}, - {Value: 0b1110111011, Bits: 10}, - {Value: 0b1110111100, Bits: 10}, - {Value: 0b1110111101, Bits: 10}, - {Value: 0b1110111110, Bits: 10}, - {Value: 0b1110111111, Bits: 10}, - {Value: 0b1111000000, Bits: 10}, - {Value: 0b1111000001, Bits: 10}, - {Value: 0b1111000010, Bits: 10}, - {Value: 0b1111000011, Bits: 10}, - {Value: 0b1111000100, Bits: 10}, - {Value: 0b1111000101, Bits: 10}, - {Value: 0b1111000110, Bits: 10}, - {Value: 0b1111000111, Bits: 10}, - {Value: 0b1111001000, Bits: 10}, - {Value: 0b1111001001, Bits: 10}, - {Value: 0b1111001010, Bits: 10}, - {Value: 0b1111001011, Bits: 10}, - {Value: 0b1111001100, Bits: 10}, - {Value: 0b1111001101, Bits: 10}, - {Value: 0b1111001110, Bits: 10}, - {Value: 0b1111001111, Bits: 10}, - {Value: 0b1111010000, Bits: 10}, - {Value: 0b1111010001, Bits: 10}, - {Value: 0b1111010010, Bits: 10}, - {Value: 0b1111010011, Bits: 10}, - {Value: 0b1111010100, Bits: 10}, - {Value: 0b1111010101, Bits: 10}, - {Value: 0b1111010110, Bits: 10}, - {Value: 0b1111010111, Bits: 10}, - {Value: 0b1111011000, Bits: 10}, - {Value: 0b1111011001, Bits: 10}, - {Value: 0b1111011010, Bits: 10}, - {Value: 0b1111011011, Bits: 10}, - {Value: 0b1111011100, Bits: 10}, - {Value: 0b1111011101, Bits: 10}, - {Value: 0b1111011110, Bits: 10}, - {Value: 0b1111011111, Bits: 10}, - {Value: 0b1111100000, Bits: 10}, - {Value: 0b1111100001, Bits: 10}, - {Value: 0b1111100010, Bits: 10}, - {Value: 0b1111100011, Bits: 10}, - {Value: 0b1111100100, Bits: 10}, - {Value: 0b1111100101, Bits: 10}, - {Value: 0b1111100110, Bits: 10}, - {Value: 0b1111100111, Bits: 10}, - {Value: 0b1111101000, Bits: 10}, - {Value: 0b1111101001, Bits: 10}, - {Value: 0b1111101010, Bits: 10}, - {Value: 0b1111101011, Bits: 10}, - {Value: 0b1111101100, Bits: 10}, - {Value: 0b1111101101, Bits: 10}, - {Value: 0b1111101110, Bits: 10}, - {Value: 0b1111101111, Bits: 10}, - {Value: 0b1111110000, Bits: 10}, - {Value: 0b1111110001, Bits: 10}, - {Value: 0b1111110010, Bits: 10}, - {Value: 0b1111110011, Bits: 10}, - {Value: 0b1111110100, Bits: 10}, - {Value: 0b1111110101, Bits: 10}, - {Value: 0b1111110110, Bits: 10}, - {Value: 0b1111110111, Bits: 10}, - {Value: 0b1111111000, Bits: 10}, - {Value: 0b1111111001, Bits: 10}, - {Value: 0b1111111010, Bits: 10}, - {Value: 0b1111111011, Bits: 10}, - {Value: 0b1111111100, Bits: 10}, - {Value: 0b1111111101, Bits: 10}, - {Value: 0b11111111100, Bits: 11}, - {Value: 0b11111111101, Bits: 11}, - {Value: 0b11111111110, Bits: 11}, - {Value: 0b11111111111, Bits: 11}, -} +func CodeForIndex(index uint32) Code { + switch { + case index < 4: + return Code{Value: index, Bits: 4} + case index < 8: + return Code{Value: 0b01000 + (index & 0b00011), Bits: 5} + case index < 16: + return Code{Value: 0b011000 + (index & 0b000111), Bits: 6} + case index < 32: + return Code{Value: 0b1000000 + (index & 0b0001111), Bits: 7} + default: + set := uint8(index / uint32(64)) -func CodeForIndex(index int) Code { - return codes[index] + return Code{ + Value: (((2 << set) - 1) << 7) + (index % uint32(64)), + Bits: set + 8, + } + } } diff --git a/encode.go b/encode.go index c466c5e..cbde1ba 100644 --- a/encode.go +++ b/encode.go @@ -16,7 +16,7 @@ func Encode(st *state.State, msg []byte) []byte { for i := 0; i < len(msg); { l, index := st.IncrementSymbol(msg[i:]) i += l - code := codes.CodeForIndex(index) + code := codes.CodeForIndex(uint32(index)) lo.Must0(w.WriteBits(uint64(code.Value), uint8(code.Bits))) } diff --git a/gencodes/main.go b/gencodes/main.go deleted file mode 100644 index eb1aa38..0000000 --- a/gencodes/main.go +++ /dev/null @@ -1,79 +0,0 @@ -package main - -import ( - "log" - "strings" -) - -func main() { - // Generate all possible values up to 10 bits - byLen := map[int][]string{ - 1: []string{"0", "1"}, - } - - for l := 2; l <= 10; l++ { - values := []string{} - - for _, v := range byLen[l-1] { - values = append(values, v+"0", v+"1") - } - - byLen[l] = values - } - - limits := map[int]int{ - 1: 0, - 2: 0, - 3: 0, - 4: 2, - 5: 8, - 6: 8, - 7: 16, - 8: 32, - 9: 64, - 10: 256, - } - - total := 0 - short := 0 - - for l := 1; l <= 10; l++ { - vs := byLen[l] - limit := limits[l] - values := []string{} - - valueLoop: - for _, v := range vs { - if limit == 0 { - break - } - - for i := 1; i < l; i++ { - for _, v2 := range byLen[i] { - if strings.HasPrefix(v, v2) { - continue valueLoop - } - } - } - - values = append(values, v) - limit-- - print(v + "\n") - } - - byLen[l] = values - - total += len(values) - if l < 8 { - short += len(values) - } - } - - for l := 1; l <= 10; l++ { - values := byLen[l] - log.Printf("%d: %d", l, len(values)) - } - - log.Printf("total=%d", total) - log.Printf("short=%d", short) -} diff --git a/genseed/genseed.go b/genseed/genseed.go index 7d2b56d..cd757d9 100644 --- a/genseed/genseed.go +++ b/genseed/genseed.go @@ -20,6 +20,8 @@ func main() { return len(sample) })) + dict := buildDictionary(samples, 1024) + def := state.NewState() log.Printf("def=%d [%s]", totalLength(def, samples), def) @@ -30,6 +32,66 @@ func main() { log.Printf("opt=%d [%s]", totalLength(opt, samples), opt) } +type pair struct { + symbol []byte + count int +} + +func buildDictionary(samples [][]byte, num int) [][]byte { + counts := map[uint64]*pair{} + + for _, sample := range samples { + for i := 0; i < len(sample); i++ { + sub := sample[i:] + for j := 2; j < min(5, len(sub)); j++ { + sub2 := sub[:j] + k := toUint64(sub2) + + p := counts[k] + if p == nil { + counts[k] = &pair{ + symbol: sub2, + count: 1, + } + } else { + p.count++ + } + } + } + } + + pairs := []*pair{} + + for _, p := range counts { + pairs = append(pairs, p) + } + + slices.SortFunc(pairs, func(a, b *pair) int { return bytes.Compare(a.symbol, b.symbol) }) + slices.SortStableFunc(pairs, func(a, b *pair) int { return b.score() - a.score() }) + + ret := [][]byte{} + + for i := 0; i < num && i < len(pairs); i++ { + ret = append(ret, pairs[i].symbol) + } + + return ret +} + +func toUint64(bs []byte) uint64 { + var ret uint64 + + for _, b := range bs { + ret = (ret << 8) | uint64(b) + } + + return ret +} + +func (p pair) score() int { + return p.count * ((len(p.symbol) * 8) - 11) +} + func optimize(st *state.State, samples [][]byte) *state.State { st.AddSymbol([]byte("it "))