Larger corpus, dictionary optimization

This commit is contained in:
Ian Gulliver
2023-12-31 20:37:58 -08:00
parent 01731ffb4f
commit 8905e150d7
4 changed files with 100106 additions and 5107 deletions

View File

@@ -1,7 +1,8 @@
package coding_test package coding_test
import ( import (
"bufio" "encoding/csv"
"io"
"os" "os"
"testing" "testing"
@@ -17,16 +18,22 @@ func TestSimple(t *testing.T) {
} }
func TestSMS(t *testing.T) { func TestSMS(t *testing.T) {
fh := lo.Must(os.Open("sms.txt")) fh := lo.Must(os.Open("text.csv"))
defer fh.Close() defer fh.Close()
s := bufio.NewScanner(fh) r := csv.NewReader(fh)
orig := 0 orig := 0
encoded := 0 encoded := 0
for s.Scan() { for true {
msg := s.Bytes() row, err := r.Read()
if err == io.EOF {
break
} else if err != nil {
t.Fatal(err)
}
msg := []byte(row[0])
e := coding.Encode(seeds.ChatState(), msg) e := coding.Encode(seeds.ChatState(), msg)
orig += len(msg) orig += len(msg)
encoded += len(e) encoded += len(e)

View File

@@ -1,8 +1,9 @@
package main package main
import ( import (
"bufio"
"bytes" "bytes"
"encoding/csv"
"io"
"log" "log"
"os" "os"
"slices" "slices"
@@ -20,16 +21,18 @@ func main() {
return len(sample) return len(sample)
})) }))
dict := buildDictionary(samples, 1024)
def := state.NewState() def := state.NewState()
log.Printf("def=%d [%s]", totalLength(def, samples), def) log.Printf("def=%d {%s}", totalLength(def, samples), def)
chat := seeds.ChatState() chat := seeds.ChatState()
log.Printf("chat=%d [%s]", totalLength(chat, samples), chat) log.Printf("chat=%d {%s}", totalLength(chat, samples), chat)
opt := optimize(state.NewState(), samples) words := buildDictionary(samples, 1024)
log.Printf("opt=%d [%s]", totalLength(opt, samples), opt) dict := optimizeDict(state.NewState(), samples, words)
log.Printf("dict=%d {%#U}", totalLength(dict, samples), dict.Symbols()[256:])
opt := optimize(dict, samples)
log.Printf("opt=%d {%s}", totalLength(opt, samples), opt)
} }
type pair struct { type pair struct {
@@ -43,7 +46,7 @@ func buildDictionary(samples [][]byte, num int) [][]byte {
for _, sample := range samples { for _, sample := range samples {
for i := 0; i < len(sample); i++ { for i := 0; i < len(sample); i++ {
sub := sample[i:] sub := sample[i:]
for j := 2; j < min(5, len(sub)); j++ { for j := 2; j < min(9, len(sub)); j++ {
sub2 := sub[:j] sub2 := sub[:j]
k := toUint64(sub2) k := toUint64(sub2)
@@ -92,8 +95,69 @@ func (p pair) score() int {
return p.count * ((len(p.symbol) * 8) - 11) return p.count * ((len(p.symbol) * 8) - 11)
} }
// sampleResult records the outcome of evaluating one candidate symbol
// against a base state.
type sampleResult struct {
// symbol is the candidate byte sequence this result was computed for.
symbol []byte
// state is the clone of the base state after the candidate was applied.
state *state.State
// score is totalLength of state over the samples; the lowest score wins.
score int
}
// optimizeDict greedily extends st with symbols drawn from dict.
//
// Each round asks optimizeDict2 for the single candidate that lowers
// totalLength over samples the most, adopts the state it produced, and drops
// every entry equal to that candidate from the pool. The loop stops when the
// pool is exhausted or when no remaining candidate improves on the current
// state. The slice passed in as dict is never modified; a filtered copy is
// built for each round.
//
// It returns the last adopted state, or st itself if no candidate helped.
func optimizeDict(st *state.State, samples [][]byte, dict [][]byte) *state.State {
	log.Printf("optDict:")

	remaining := dict
	for len(remaining) != 0 {
		winner := optimizeDict2(st, samples, remaining)
		if winner == nil {
			// Nothing left in the pool beats the current state.
			break
		}

		st = winner.state
		log.Printf("\titer=%d {%#U}", totalLength(st, samples), winner.symbol)

		// Rebuild the pool without the symbol that was just adopted.
		kept := make([][]byte, 0, len(remaining))
		for i := range remaining {
			if bytes.Equal(remaining[i], winner.symbol) {
				continue
			}
			kept = append(kept, remaining[i])
		}
		remaining = kept
	}

	return st
}
// optimizeDict2 runs one selection round over dict.
//
// For every candidate symbol it clones baseState, calls AddSymbol on the
// clone and scores the clone with totalLength over samples. Candidates are
// evaluated concurrently, one goroutine per symbol.
// NOTE(review): this assumes baseState.Clone and totalLength are safe to call
// from multiple goroutines at once — confirm against the state package.
//
// It returns the result with the lowest score, or nil when dict is empty or
// when no candidate scores strictly below baseState itself. Ties on score are
// resolved in favour of the bytewise-smallest symbol so the outcome does not
// depend on goroutine scheduling.
func optimizeDict2(baseState *state.State, samples [][]byte, dict [][]byte) *sampleResult {
	// slices.MinFunc panics on an empty slice; an empty pool simply has no
	// improvement to offer.
	if len(dict) == 0 {
		return nil
	}

	ch := make(chan *sampleResult, 100)

	for _, symbol := range dict {
		// res is declared inside the loop body, so each goroutine owns its
		// own result.
		res := &sampleResult{
			symbol: symbol,
		}

		go func() {
			res.state = baseState.Clone()
			res.state.AddSymbol(res.symbol)
			res.score = totalLength(res.state, samples)
			ch <- res
		}()
	}

	// Exactly one result arrives per candidate.
	results := make([]*sampleResult, 0, len(dict))
	for range dict {
		results = append(results, <-ch)
	}

	// Results arrive in scheduling order; sorting by symbol first makes the
	// tie-break below deterministic (MinFunc returns the first minimal
	// element).
	slices.SortFunc(results, func(a, b *sampleResult) int { return bytes.Compare(a.symbol, b.symbol) })
	best := slices.MinFunc(results, func(a, b *sampleResult) int { return a.score - b.score })

	if best.score >= totalLength(baseState, samples) {
		return nil
	}

	return best
}
func optimize(st *state.State, samples [][]byte) *state.State { func optimize(st *state.State, samples [][]byte) *state.State {
st.AddSymbol([]byte("it ")) log.Printf("opt:")
for true { for true {
better := optimize2(st, samples) better := optimize2(st, samples)
@@ -101,46 +165,39 @@ func optimize(st *state.State, samples [][]byte) *state.State {
return st return st
} }
st = better st = better
log.Printf("\titer=%d [%s]", totalLength(st, samples), st) log.Printf("\titer=%d {%s}", totalLength(st, samples), st)
} }
return st return st
} }
type sampleResult struct {
symbol []byte
state *state.State
score int
}
func optimize2(baseState *state.State, samples [][]byte) *state.State { func optimize2(baseState *state.State, samples [][]byte) *state.State {
ch := make(chan sampleResult, 100) ch := make(chan *sampleResult, 100)
symbols := baseState.Symbols() symbols := baseState.Symbols()
for _, symbol := range symbols { for _, symbol := range symbols {
res := sampleResult{ res := &sampleResult{
symbol: symbol, symbol: symbol,
} }
go func() { go func() {
st := baseState.Clone() res.state = baseState.Clone()
st.IncrementSymbol(res.symbol) res.state.IncrementSymbol(res.symbol)
res.state = st res.score = totalLength(res.state, samples)
res.score = totalLength(st, samples)
ch <- res ch <- res
}() }()
} }
results := []sampleResult{} results := []*sampleResult{}
for _ = range symbols { for _ = range symbols {
results = append(results, <-ch) results = append(results, <-ch)
} }
slices.SortFunc(results, func(a, b sampleResult) int { return bytes.Compare(a.symbol, b.symbol) }) slices.SortFunc(results, func(a, b *sampleResult) int { return bytes.Compare(a.symbol, b.symbol) })
best := slices.MaxFunc(results, func(a, b sampleResult) int { return b.score - a.score }) best := slices.MaxFunc(results, func(a, b *sampleResult) int { return b.score - a.score })
if best.score == totalLength(baseState, samples) { if best.score >= totalLength(baseState, samples) {
return nil return nil
} }
@@ -154,18 +211,25 @@ func totalLength(st *state.State, samples [][]byte) int {
} }
func loadSamples() ([][]byte, error) { func loadSamples() ([][]byte, error) {
fh, err := os.Open("sms.txt") fh, err := os.Open("text.csv")
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer fh.Close() defer fh.Close()
s := bufio.NewScanner(fh) r := csv.NewReader(fh)
ret := [][]byte{} ret := [][]byte{}
for s.Scan() { for true {
ret = append(ret, s.Bytes()) row, err := r.Read()
if err == io.EOF {
break
} else if err != nil {
return nil, err
}
ret = append(ret, []byte(row[0]))
} }
return ret, nil return ret, nil

5072
sms.txt

File diff suppressed because it is too large Load Diff

100000
text.csv Normal file

File diff suppressed because it is too large Load Diff