Files

312 lines
8.8 KiB
Go
Raw Permalink Normal View History

// Package claude reads the weight off a scale photo using the Anthropic vision
// API. It owns credential resolution, prompting, the majority-vote logic, and
// response parsing — all the Claude-specific handling lives here.
package claude
import (
"bytes"
"encoding/base64"
"encoding/json"
"fmt"
"image"
"image/png"
"io"
"log/slog"
"net/http"
"os"
"os/exec"
"runtime"
"strings"
"sync"
"time"
xdraw "golang.org/x/image/draw"
)
// trace emits a "begin" debug record for op and hands back a closure that emits
// the matching "end" record together with the time elapsed since trace was
// called. Both records go through slog.Debug, so they only show up when the
// default logger admits debug-level output (the original note ties that to
// running with -v). Typical use is `defer trace("name")()`.
func trace(op string) func() {
	started := time.Now()
	slog.Debug("begin", "op", op)

	finish := func() {
		slog.Debug("end", "op", op, "dur", time.Since(started))
	}
	return finish
}
// weightPrompt primes a general vision model to read the scale. It asks for a
// digit-by-digit reading of the display, the unit shown (g, kg, lb or oz), a
// low/medium/high self-rating, and a final line containing only a JSON object
// with "weight", "unit" and "confidence" keys, which readOnce then extracts
// with lastJSONObject.
//
// NOTE(review): this comment previously credited a hint about the housing
// clipping the tops of the digits with stopping a 7 (which has a top bar) from
// being misread as a 1 on this kind of LCD. No such hint appears in the prompt
// text below — confirm whether it was dropped on purpose or needs restoring.
const weightPrompt = `This photo shows a 3D printer filament spool sitting on a digital kitchen scale. ` +
	`Read the weight on the scale's digital display, digit by digit, exactly as shown. ` +
	`Report the unit shown on the display (g, kg, lb, or oz), and rate your own confidence ` +
	`in the reading as "low", "medium", or "high". ` +
	`Reason briefly, then on the LAST line output ONLY JSON: ` +
	`{"weight": <number>, "unit": "<g|kg|lb|oz>", "confidence": "<low|medium|high>"}.`
// Reading is the result of reading a scale photo, as assembled by ReadWeight
// from the votes that completed without error.
type Reading struct {
	// Weight is the numeric value of the winning (most-voted) answer.
	Weight float64
	// Unit is the winning answer's unit: g, kg, lb or oz (readOnce lowercases
	// and trims whatever the model reported).
	Unit string
	// Confidence is the share of successful votes that matched the winner,
	// multiplied by the mean self-confidence of those matching votes
	// (low/medium/high map to 1/3, 2/3 and 1).
	Confidence float64
	// Weights holds each successful vote's numeric reading, in call order.
	Weights []float64
	// ModelConfidences holds each successful vote's self-rating, as the model
	// worded it (whitespace-trimmed), in the same order as Weights.
	ModelConfidences []string
}
// ReadWeight asks a vision model to read the display, taking the most common
// answer across a few independent reads for robustness. No image regions or
// digit-font geometry are assumed: the model does general OCR on the whole photo.
//
// Two reads agree when both their weight and their unit match. When several
// answers tie for the most votes (for example, every read disagrees), the
// answer that was produced by the earliest read wins, so the same set of votes
// always yields the same Reading.
//
// The overall confidence is the fraction of successful reads that agreed with
// the winning answer, scaled by the average self-confidence the model reported
// for those agreeing reads (low/medium/high -> 1/3, 2/3, 3/3).
//
// Reads that fail are ignored as long as at least one succeeds. If every read
// fails, ReadWeight returns a zero Reading and the error of the last failed read.
func ReadWeight(img image.Image, auth Auth) (Reading, error) {
	defer trace("claude.readWeight")()

	const votes = 3
	type vote struct {
		w    float64
		u    string
		conf string
	}
	type outcome struct {
		v   vote
		err error
	}
	run := func() outcome {
		w, u, c, err := readOnce(img, auth)
		return outcome{vote{w, u, c}, err}
	}

	// Run the first call alone so it populates the prompt cache, then fire the
	// remaining (independent) votes concurrently to reuse that cache. Each
	// goroutine writes only its own slot, so no locking is needed.
	outcomes := make([]outcome, votes)
	outcomes[0] = run()
	var wg sync.WaitGroup
	for i := 1; i < votes; i++ {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			outcomes[i] = run()
		}(i)
	}
	wg.Wait()

	got := make([]vote, 0, votes)
	var lastErr error
	for _, o := range outcomes {
		if o.err != nil {
			lastErr = o.err
			continue
		}
		got = append(got, o.v)
	}
	if len(got) == 0 {
		return Reading{}, lastErr
	}

	key := func(w float64, u string) string { return fmt.Sprintf("%g|%s", w, u) }
	tally := make(map[string]int, len(got))
	for _, v := range got {
		tally[key(v.w, v.u)]++
	}

	// Choose the winner by walking the votes in call order instead of ranging
	// over the map: map iteration order is randomized, which would make ties
	// resolve differently from run to run. The strict ">" keeps the earliest
	// answer among equally popular ones.
	bestKey, bestN := "", 0
	for _, v := range got {
		k := key(v.w, v.u)
		if n := tally[k]; n > bestN {
			bestKey, bestN = k, n
		}
	}

	res := Reading{}
	var winnerMC []float64
	for _, v := range got {
		res.Weights = append(res.Weights, v.w)
		res.ModelConfidences = append(res.ModelConfidences, v.conf)
		if key(v.w, v.u) == bestKey {
			res.Weight, res.Unit = v.w, v.u
			winnerMC = append(winnerMC, modelConfidence(v.conf))
		}
	}
	agreement := float64(bestN) / float64(len(got))
	res.Confidence = agreement * mean(winnerMC)
	return res, nil
}
// mean returns the arithmetic average of xs, or 0 when xs is empty (so callers
// never divide by zero).
func mean(xs []float64) float64 {
	count := len(xs)
	if count == 0 {
		return 0
	}
	total := 0.0
	for i := 0; i < count; i++ {
		total += xs[i]
	}
	return total / float64(count)
}
// modelConfidence converts the model's textual self-rating into a fraction:
// "high" -> 1, "medium" -> 2/3, and everything else -> 1/3. Matching ignores
// case and surrounding whitespace. "low" and any unrecognised wording share the
// lowest score so that an unexpected reply is treated conservatively.
func modelConfidence(s string) float64 {
	rating := strings.ToLower(strings.TrimSpace(s))
	if rating == "high" {
		return 3.0 / 3
	}
	if rating == "medium" {
		return 2.0 / 3
	}
	return 1.0 / 3
}
// readOnce performs a single vote: it PNG-encodes the (possibly downscaled)
// photo, posts it with weightPrompt to the Anthropic messages endpoint using
// the supplied credentials, and parses the last JSON object in the reply.
//
// It returns the weight, the unit (lowercased and trimmed), and the model's
// self-rated confidence (trimmed, otherwise as the model worded it). Any
// failure — encoding, transport, a non-200 status, an unreadable body, or a
// reply without parseable JSON — is returned as an error with zero values for
// the other results.
func readOnce(img image.Image, auth Auth) (float64, string, string, error) {
	defer trace("claude.vote")()

	// Downscale only to satisfy the vision API's size limit — not a crop.
	small := downscale(img, 1568)
	var buf bytes.Buffer
	if err := png.Encode(&buf, small); err != nil {
		return 0, "", "", fmt.Errorf("encode PNG: %w", err)
	}
	b64 := base64.StdEncoding.EncodeToString(buf.Bytes())

	reqBody := map[string]interface{}{
		"model":      "claude-opus-4-8",
		"max_tokens": 600,
		"messages": []map[string]interface{}{{
			"role": "user",
			"content": []map[string]interface{}{
				{"type": "image", "source": map[string]interface{}{"type": "base64", "media_type": "image/png", "data": b64}},
				// Cache the image+prompt prefix so the repeated majority-vote calls
				// (same image, same prompt) hit the cache instead of re-sending it.
				{"type": "text", "text": weightPrompt, "cache_control": map[string]string{"type": "ephemeral"}},
			},
		}},
	}
	if auth.system != "" {
		reqBody["system"] = auth.system
	}
	body, err := json.Marshal(reqBody)
	if err != nil {
		return 0, "", "", fmt.Errorf("encode request: %w", err)
	}

	req, err := http.NewRequest("POST", "https://api.anthropic.com/v1/messages", bytes.NewReader(body))
	if err != nil {
		return 0, "", "", err
	}
	req.Header.Set("content-type", "application/json")
	req.Header.Set("anthropic-version", "2023-06-01")
	auth.apply(req)

	resp, err := (&http.Client{Timeout: 90 * time.Second}).Do(req)
	if err != nil {
		return 0, "", "", err
	}
	defer resp.Body.Close()

	// A failed or truncated read must be reported as such; otherwise it shows
	// up later as a misleading JSON syntax error.
	respBody, err := io.ReadAll(resp.Body)
	if err != nil {
		return 0, "", "", fmt.Errorf("read API response (status %d): %w", resp.StatusCode, err)
	}
	if resp.StatusCode != http.StatusOK {
		return 0, "", "", fmt.Errorf("API %d: %s", resp.StatusCode, strings.TrimSpace(string(respBody)))
	}

	var parsed struct {
		Content []struct {
			Text string `json:"text"`
		} `json:"content"`
	}
	if err := json.Unmarshal(respBody, &parsed); err != nil {
		return 0, "", "", err
	}
	// The reply may arrive as several content parts; join their text in order.
	var sb strings.Builder
	for _, c := range parsed.Content {
		sb.WriteString(c.Text)
	}
	text := sb.String()

	var wr struct {
		Weight     float64 `json:"weight"`
		Unit       string  `json:"unit"`
		Confidence string  `json:"confidence"`
	}
	if err := json.Unmarshal([]byte(lastJSONObject(text)), &wr); err != nil {
		return 0, "", "", fmt.Errorf("parse model reply %q: %w", text, err)
	}
	return wr.Weight, strings.ToLower(strings.TrimSpace(wr.Unit)), strings.TrimSpace(wr.Confidence), nil
}
// lastJSONObject extracts the final brace-balanced {...} span in s, which is
// where the model is asked to put its answer. It starts at the last '}' and
// scans backwards, counting braces, until the matching '{' is found. If s has
// no '}' at all, or the braces before the last '}' never balance, s is returned
// unchanged. Braces are counted purely by character, so ones inside JSON string
// values are not treated specially.
func lastJSONObject(s string) string {
	end := strings.LastIndexByte(s, '}')
	if end == -1 {
		return s
	}

	unmatched := 0
	for pos := end; pos >= 0; pos-- {
		if s[pos] == '}' {
			unmatched++
			continue
		}
		if s[pos] != '{' {
			continue
		}
		if unmatched--; unmatched == 0 {
			return s[pos : end+1]
		}
	}
	return s
}
// downscale returns img unchanged when both its width and height are at most
// max. Otherwise it returns a new RGBA image, resampled with Catmull-Rom, whose
// longer side is scaled down to max with the aspect ratio kept.
//
// The scaled dimensions are truncated to whole pixels, so for a very elongated
// image the shorter side could truncate to 0; it is clamped to at least 1 pixel
// so the result is never an empty image.
func downscale(img image.Image, max int) image.Image {
	b := img.Bounds()
	w, h := b.Dx(), b.Dy()
	if w <= max && h <= max {
		return img
	}

	// Scale by whichever side is longer so that neither ends up above max.
	longest := w
	if h > w {
		longest = h
	}
	scale := float64(max) / float64(longest)

	dw, dh := int(float64(w)*scale), int(float64(h)*scale)
	if dw < 1 {
		dw = 1
	}
	if dh < 1 {
		dh = 1
	}

	dst := image.NewRGBA(image.Rect(0, 0, dw, dh))
	xdraw.CatmullRom.Scale(dst, dst.Bounds(), img, b, xdraw.Over, nil)
	return dst
}
// Auth holds one resolved set of Anthropic API credentials, together with the
// extra request settings that go with that credential type.
type Auth struct {
	name   string // description of the mechanism, returned by Name
	header string // request header that carries the credential
	value  string // value written to that header
	beta   string // anthropic-beta header value; empty means the header is not sent
	system string // system prompt readOnce adds to the request body; empty means none
}

// Name describes the credential mechanism in use (for logging).
func (a Auth) Name() string {
	return a.name
}

// apply stamps the credential onto req: it always sets the credential header,
// and sets the anthropic-beta header only when a beta value is configured.
func (a Auth) apply(req *http.Request) {
	hdr := req.Header
	hdr.Set(a.header, a.value)
	if a.beta == "" {
		return
	}
	hdr.Set("anthropic-beta", a.beta)
}
// ResolveAuth looks for Anthropic credentials and returns the first it finds,
// trying in this order:
//
//  1. the ANTHROPIC_API_KEY environment variable, sent as an x-api-key header;
//  2. the ANTHROPIC_AUTH_TOKEN environment variable, sent as a Bearer token;
//  3. the local Claude Code OAuth token (macOS keychain), sent as a Bearer token.
//
// An empty value counts as absent. If none of the three yields a credential,
// ResolveAuth returns a zero Auth and an error.
func ResolveAuth() (Auth, error) {
	// Both Bearer-style sources share the same header layout, beta flag and
	// system prompt; only the label and the token differ.
	asBearer := func(label, token string) Auth {
		return Auth{
			name:   label,
			header: "authorization",
			value:  "Bearer " + token,
			beta:   "oauth-2025-04-20",
			system: "You are Claude Code, Anthropic's official CLI for Claude.",
		}
	}

	apiKey := os.Getenv("ANTHROPIC_API_KEY")
	if apiKey != "" {
		return Auth{
			name:   "ANTHROPIC_API_KEY (x-api-key)",
			header: "x-api-key",
			value:  apiKey,
		}, nil
	}

	envToken := os.Getenv("ANTHROPIC_AUTH_TOKEN")
	if envToken != "" {
		return asBearer("ANTHROPIC_AUTH_TOKEN (Bearer)", envToken), nil
	}

	// Only consult the keychain once both environment variables came up empty.
	keychainToken := claudeCodeOAuthToken()
	if keychainToken != "" {
		return asBearer("Claude Code OAuth, macOS keychain (Bearer)", keychainToken), nil
	}

	return Auth{}, fmt.Errorf("no credentials: set ANTHROPIC_API_KEY or ANTHROPIC_AUTH_TOKEN")
}
// claudeCodeOAuthToken fetches the access token from the "Claude Code-credentials"
// generic-password item by running the macOS `security` tool and decoding its
// output as JSON (claudeAiOauth.accessToken). It is best-effort: on any other
// operating system, or if the command fails or its output is not valid JSON,
// it returns the empty string rather than an error.
func claudeCodeOAuthToken() string {
	if runtime.GOOS != "darwin" {
		return ""
	}

	lookup := exec.Command("security", "find-generic-password", "-s", "Claude Code-credentials", "-w")
	raw, err := lookup.Output()
	if err != nil {
		return ""
	}

	type oauthEntry struct {
		AccessToken string `json:"accessToken"`
	}
	var stored struct {
		ClaudeAiOauth oauthEntry `json:"claudeAiOauth"`
	}
	if json.Unmarshal(raw, &stored) != nil {
		return ""
	}
	return stored.ClaudeAiOauth.AccessToken
}