Larger corpus, dictionary optimization

This commit is contained in:
Ian Gulliver
2023-12-31 20:37:58 -08:00
parent 01731ffb4f
commit 8905e150d7
4 changed files with 100106 additions and 5107 deletions
+12 -5
View File
@@ -1,7 +1,8 @@
package coding_test package coding_test
import ( import (
"bufio" "encoding/csv"
"io"
"os" "os"
"testing" "testing"
@@ -17,16 +18,22 @@ func TestSimple(t *testing.T) {
} }
func TestSMS(t *testing.T) { func TestSMS(t *testing.T) {
fh := lo.Must(os.Open("sms.txt")) fh := lo.Must(os.Open("text.csv"))
defer fh.Close() defer fh.Close()
s := bufio.NewScanner(fh) r := csv.NewReader(fh)
orig := 0 orig := 0
encoded := 0 encoded := 0
for s.Scan() { for true {
msg := s.Bytes() row, err := r.Read()
if err == io.EOF {
break
} else if err != nil {
t.Fatal(err)
}
msg := []byte(row[0])
e := coding.Encode(seeds.ChatState(), msg) e := coding.Encode(seeds.ChatState(), msg)
orig += len(msg) orig += len(msg)
encoded += len(e) encoded += len(e)
+94 -30
View File
@@ -1,8 +1,9 @@
package main package main
import ( import (
"bufio"
"bytes" "bytes"
"encoding/csv"
"io"
"log" "log"
"os" "os"
"slices" "slices"
@@ -20,16 +21,18 @@ func main() {
return len(sample) return len(sample)
})) }))
dict := buildDictionary(samples, 1024)
def := state.NewState() def := state.NewState()
log.Printf("def=%d [%s]", totalLength(def, samples), def) log.Printf("def=%d {%s}", totalLength(def, samples), def)
chat := seeds.ChatState() chat := seeds.ChatState()
log.Printf("chat=%d [%s]", totalLength(chat, samples), chat) log.Printf("chat=%d {%s}", totalLength(chat, samples), chat)
opt := optimize(state.NewState(), samples) words := buildDictionary(samples, 1024)
log.Printf("opt=%d [%s]", totalLength(opt, samples), opt) dict := optimizeDict(state.NewState(), samples, words)
log.Printf("dict=%d {%#U}", totalLength(dict, samples), dict.Symbols()[256:])
opt := optimize(dict, samples)
log.Printf("opt=%d {%s}", totalLength(opt, samples), opt)
} }
type pair struct { type pair struct {
@@ -43,7 +46,7 @@ func buildDictionary(samples [][]byte, num int) [][]byte {
for _, sample := range samples { for _, sample := range samples {
for i := 0; i < len(sample); i++ { for i := 0; i < len(sample); i++ {
sub := sample[i:] sub := sample[i:]
for j := 2; j < min(5, len(sub)); j++ { for j := 2; j < min(9, len(sub)); j++ {
sub2 := sub[:j] sub2 := sub[:j]
k := toUint64(sub2) k := toUint64(sub2)
@@ -92,8 +95,69 @@ func (p pair) score() int {
return p.count * ((len(p.symbol) * 8) - 11) return p.count * ((len(p.symbol) * 8) - 11)
} }
// sampleResult holds the outcome of evaluating one candidate symbol against
// the sample corpus. Instances are filled in by worker goroutines and handed
// back to the caller over a channel.
type sampleResult struct {
// symbol is the candidate byte sequence this result was computed for.
symbol []byte
// state is the cloned state after the candidate symbol was applied to it.
state *state.State
// score is the value totalLength returned for state over the samples;
// callers select the result with the smallest score.
score int
}
// optimizeDict greedily grows st from the candidate symbols in dict.
//
// Each round asks optimizeDict2 for the single candidate that lowers the
// total encoded length of samples the most, adopts the resulting state, logs
// the new total together with the chosen symbol, and drops that symbol from
// the candidate pool. The loop ends when optimizeDict2 reports no improving
// candidate (nil) or when the pool is exhausted; the state reached so far is
// returned.
//
// The dict slice passed in by the caller is never modified: every round
// builds a fresh slice of the remaining candidates.
func optimizeDict(st *state.State, samples [][]byte, dict [][]byte) *state.State {
	log.Printf("optDict:")

	remaining := dict
	for len(remaining) > 0 {
		winner := optimizeDict2(st, samples, remaining)
		if winner == nil {
			// No remaining candidate improves on the current state.
			break
		}

		st = winner.state
		log.Printf("\titer=%d {%#U}", totalLength(st, samples), winner.symbol)

		// Keep every candidate except the one that was just adopted.
		next := [][]byte{}
		for _, candidate := range remaining {
			if bytes.Equal(candidate, winner.symbol) {
				continue
			}
			next = append(next, candidate)
		}
		remaining = next
	}

	return st
}
// optimizeDict2 evaluates every candidate symbol in dict against baseState
// and returns the result with the smallest total encoded length.
//
// For each candidate a goroutine clones baseState, adds the candidate via
// AddSymbol, and scores the clone with totalLength over samples. Results
// arrive on the channel in arbitrary order, so they are sorted by symbol
// bytes before selection to make tie-breaking between equal scores
// deterministic.
//
// It returns nil when dict is empty or when the best candidate does not
// score strictly lower than baseState itself, i.e. when no candidate is an
// improvement.
func optimizeDict2(baseState *state.State, samples [][]byte, dict [][]byte) *sampleResult {
	// slices.MaxFunc panics on an empty slice; with no candidates there is
	// nothing that could improve on baseState.
	if len(dict) == 0 {
		return nil
	}

	ch := make(chan *sampleResult, 100)

	for _, symbol := range dict {
		// res is declared inside the loop body, so each goroutine captures
		// its own result value.
		res := &sampleResult{
			symbol: symbol,
		}

		go func() {
			res.state = baseState.Clone()
			res.state.AddSymbol(res.symbol)
			res.score = totalLength(res.state, samples)
			ch <- res
		}()
	}

	// Exactly one result is sent per candidate, so receive that many.
	results := make([]*sampleResult, 0, len(dict))
	for range dict {
		results = append(results, <-ch)
	}

	slices.SortFunc(results, func(a, b *sampleResult) int { return bytes.Compare(a.symbol, b.symbol) })

	// The comparator is inverted on purpose: the "maximum" under it is the
	// result with the lowest score.
	best := slices.MaxFunc(results, func(a, b *sampleResult) int { return b.score - a.score })

	if best.score >= totalLength(baseState, samples) {
		return nil
	}

	return best
}
func optimize(st *state.State, samples [][]byte) *state.State { func optimize(st *state.State, samples [][]byte) *state.State {
st.AddSymbol([]byte("it ")) log.Printf("opt:")
for true { for true {
better := optimize2(st, samples) better := optimize2(st, samples)
@@ -101,46 +165,39 @@ func optimize(st *state.State, samples [][]byte) *state.State {
return st return st
} }
st = better st = better
log.Printf("\titer=%d [%s]", totalLength(st, samples), st) log.Printf("\titer=%d {%s}", totalLength(st, samples), st)
} }
return st return st
} }
// sampleResult holds the outcome of evaluating one candidate symbol against
// the sample corpus. Instances are filled in by worker goroutines and handed
// back to the caller over a channel.
type sampleResult struct {
// symbol is the candidate byte sequence this result was computed for.
symbol []byte
// state is the cloned state after the candidate symbol was applied to it.
state *state.State
// score is the value totalLength returned for state over the samples;
// callers select the result with the smallest score.
score int
}
func optimize2(baseState *state.State, samples [][]byte) *state.State { func optimize2(baseState *state.State, samples [][]byte) *state.State {
ch := make(chan sampleResult, 100) ch := make(chan *sampleResult, 100)
symbols := baseState.Symbols() symbols := baseState.Symbols()
for _, symbol := range symbols { for _, symbol := range symbols {
res := sampleResult{ res := &sampleResult{
symbol: symbol, symbol: symbol,
} }
go func() { go func() {
st := baseState.Clone() res.state = baseState.Clone()
st.IncrementSymbol(res.symbol) res.state.IncrementSymbol(res.symbol)
res.state = st res.score = totalLength(res.state, samples)
res.score = totalLength(st, samples)
ch <- res ch <- res
}() }()
} }
results := []sampleResult{} results := []*sampleResult{}
for _ = range symbols { for _ = range symbols {
results = append(results, <-ch) results = append(results, <-ch)
} }
slices.SortFunc(results, func(a, b sampleResult) int { return bytes.Compare(a.symbol, b.symbol) }) slices.SortFunc(results, func(a, b *sampleResult) int { return bytes.Compare(a.symbol, b.symbol) })
best := slices.MaxFunc(results, func(a, b sampleResult) int { return b.score - a.score }) best := slices.MaxFunc(results, func(a, b *sampleResult) int { return b.score - a.score })
if best.score == totalLength(baseState, samples) { if best.score >= totalLength(baseState, samples) {
return nil return nil
} }
@@ -154,18 +211,25 @@ func totalLength(st *state.State, samples [][]byte) int {
} }
func loadSamples() ([][]byte, error) { func loadSamples() ([][]byte, error) {
fh, err := os.Open("sms.txt") fh, err := os.Open("text.csv")
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer fh.Close() defer fh.Close()
s := bufio.NewScanner(fh) r := csv.NewReader(fh)
ret := [][]byte{} ret := [][]byte{}
for s.Scan() { for true {
ret = append(ret, s.Bytes()) row, err := r.Read()
if err == io.EOF {
break
} else if err != nil {
return nil, err
}
ret = append(ret, []byte(row[0]))
} }
return ret, nil return ret, nil
-5072
View File
File diff suppressed because it is too large Load Diff
+100000
View File
File diff suppressed because it is too large Load Diff