diff --git a/internal/bencode/validate.go b/internal/bencode/validate.go
new file mode 100644
index 0000000..6b040e6
--- /dev/null
+++ b/internal/bencode/validate.go
@@ -0,0 +1,250 @@
+// Package bencode provides a LangSec-style structural validator for bencoded
+// data (BEP 3). Inspired by Sassaman & Patterson, "The Science of Insecurity":
+// recognize completely against a bounded grammar before any execution
+// (decoding, indexing, recursion) touches the input.
+//
+// Validate walks the bytes without allocating the decoded tree. It enforces
+// configurable limits on nesting depth, individual string length, list/dict
+// element counts, and total input size. A successful Validate guarantees the
+// input is well-formed bencode within those bounds — only then is it safe to
+// hand to a permissive decoder like github.com/zeebo/bencode.
+package bencode
+
+import (
+	"errors"
+	"fmt"
+)
+
+// Limits bounds the structural complexity of a bencoded payload.
+// Callers should pick limits sized for their threat model: a .torrent file
+// from an untrusted source can be large and deeply nested; a peer-protocol
+// extension handshake should be small and shallow.
+type Limits struct {
+	MaxBytes       int // total input size; 0 means no overall byte cap
+	MaxDepth       int // nesting depth of lists+dicts; 0 forbids any container
+	MaxStrLen      int // max length of any single byte string; 0 means no cap
+	MaxListLen     int // max elements per list; 0 means no cap
+	MaxKeysPerDict int // max keys per dict; 0 means no cap
+}
+
+// TorrentLimits is sized for hybrid v1+v2 .torrent files. The pieces string
+// and v2 piece-layer entries can be megabytes for large content. File trees
+// (BEP 52) rarely exceed depth ~8 in practice, so 64 is a generous DoS bound.
+var TorrentLimits = Limits{
+	MaxBytes:       16 << 20, // 16 MiB
+	MaxDepth:       64,
+	MaxStrLen:      8 << 20, // 8 MiB — covers very long pieces strings
+	MaxListLen:     1 << 17, // 131072 — files in a huge multi-file torrent
+	MaxKeysPerDict: 1 << 14, // 16384 — file tree directory fan-out
+}
+
+// PeerMessageLimits is sized for BEP 10 extension handshakes and BEP 9
+// metadata-piece headers. These travel on a peer connection and should be
+// small; the 1 MiB cap kills oversized-allocation attempts at the door.
+var PeerMessageLimits = Limits{
+	MaxBytes:       1 << 20, // 1 MiB — extension handshakes can carry feature dicts
+	MaxDepth:       16,
+	MaxStrLen:      256 << 10, // 256 KiB
+	MaxListLen:     1024,
+	MaxKeysPerDict: 256,
+}
+
+// TrackerResponseLimits is sized for BEP 3 tracker responses. The peers list
+// (compact or dictionary form) is the dominant size, capped here at ~1 MiB.
+var TrackerResponseLimits = Limits{
+	MaxBytes:       2 << 20, // 2 MiB
+	MaxDepth:       16,
+	MaxStrLen:      1 << 20, // 1 MiB — compact peers string for huge swarms
+	MaxListLen:     65536,
+	MaxKeysPerDict: 256,
+}
+
+// Validation errors. Callers can use errors.Is to distinguish them; the
+// wrapped error message usually carries the offset, which helps diagnostics.
+var (
+	ErrTooLarge       = errors.New("bencode: input exceeds MaxBytes")
+	ErrEOF            = errors.New("bencode: unexpected end of input")
+	ErrTrailingData   = errors.New("bencode: trailing data after value")
+	ErrSyntax         = errors.New("bencode: malformed token")
+	ErrDepth          = errors.New("bencode: nesting exceeds MaxDepth")
+	ErrStringTooLong  = errors.New("bencode: string exceeds MaxStrLen")
+	ErrListTooLong    = errors.New("bencode: list exceeds MaxListLen")
+	ErrDictTooLarge   = errors.New("bencode: dict exceeds MaxKeysPerDict")
+	ErrIntegerSyntax  = errors.New("bencode: malformed integer")
+	ErrNegativeLength = errors.New("bencode: negative string length")
+)
+
+// Validate parses data as bencode and rejects it if any structural rule
+// or limit is violated. It does not allocate the decoded value.
+func Validate(data []byte, lim Limits) error {
+	if lim.MaxBytes > 0 && len(data) > lim.MaxBytes {
+		return fmt.Errorf("%w: got %d", ErrTooLarge, len(data))
+	}
+	p := &parser{data: data, lim: lim}
+	if err := p.value(0); err != nil {
+		return err
+	}
+	if p.pos != len(data) {
+		return fmt.Errorf("%w: %d trailing byte(s)", ErrTrailingData, len(data)-p.pos)
+	}
+	return nil
+}
+
+type parser struct {
+	data []byte
+	pos  int
+	lim  Limits
+}
+
+func (p *parser) value(depth int) error {
+	if p.pos >= len(p.data) {
+		return ErrEOF
+	}
+	b := p.data[p.pos]
+	switch {
+	case b == 'i':
+		return p.integer()
+	case b == 'l':
+		return p.list(depth)
+	case b == 'd':
+		return p.dict(depth)
+	case b >= '0' && b <= '9':
+		_, err := p.byteString()
+		return err
+	default:
+		return fmt.Errorf("%w: unexpected %q at offset %d", ErrSyntax, b, p.pos)
+	}
+}
+
+// integer parses "i" ["-"] digits "e". Bencode forbids leading zeros except for "0"
+// and forbids "-0". We enforce both since they signal a malformed encoder.
+func (p *parser) integer() error {
+	start := p.pos
+	p.pos++ // consume 'i'
+	if p.pos >= len(p.data) {
+		return ErrEOF
+	}
+	digitStart := p.pos
+	if p.data[p.pos] == '-' {
+		p.pos++
+		if p.pos >= len(p.data) {
+			return ErrEOF
+		}
+	}
+	if p.pos >= len(p.data) || p.data[p.pos] < '0' || p.data[p.pos] > '9' {
+		return fmt.Errorf("%w at offset %d", ErrIntegerSyntax, start)
+	}
+	first := p.pos
+	for p.pos < len(p.data) && p.data[p.pos] >= '0' && p.data[p.pos] <= '9' {
+		p.pos++
+	}
+	digits := p.data[first:p.pos]
+	// Disallow leading zeros: "i0e" ok, "i00e" / "i01e" / "i-0e" not.
+	if len(digits) > 1 && digits[0] == '0' {
+		return fmt.Errorf("%w: leading zero at offset %d", ErrIntegerSyntax, start)
+	}
+	if digitStart != first && digits[0] == '0' {
+		// negative zero: "-0"
+		return fmt.Errorf("%w: negative zero at offset %d", ErrIntegerSyntax, start)
+	}
+	if p.pos >= len(p.data) || p.data[p.pos] != 'e' {
+		return fmt.Errorf("%w: missing 'e' at offset %d", ErrIntegerSyntax, p.pos)
+	}
+	p.pos++ // consume 'e'
+	return nil
+}
+
+// byteString parses length ":" bytes and returns the byte content (a sub-slice of the input, not a copy).
+func (p *parser) byteString() ([]byte, error) {
+	start := p.pos
+	// Parse length digits — bencode does NOT allow leading zeros for length,
+	// except length "0" itself.
+	for p.pos < len(p.data) && p.data[p.pos] >= '0' && p.data[p.pos] <= '9' {
+		p.pos++
+	}
+	if p.pos == start {
+		return nil, fmt.Errorf("%w: expected digit at offset %d", ErrSyntax, start)
+	}
+	digits := p.data[start:p.pos]
+	if len(digits) > 1 && digits[0] == '0' {
+		return nil, fmt.Errorf("%w: leading zero in string length at offset %d", ErrSyntax, start)
+	}
+	// Parse length integer; cap it to int range. We bound by MaxStrLen so
+	// even a multi-gig declared length is rejected before we look at memory.
+	var length int
+	for _, d := range digits {
+		// Overflow guard — the MaxStrLen check below catches plausible values.
+		if length > (int(^uint(0)>>1)-int(d-'0'))/10 { // exact: next step would exceed max int (32- or 64-bit)
+			return nil, fmt.Errorf("%w: length overflow at offset %d", ErrStringTooLong, start)
+		}
+		length = length*10 + int(d-'0')
+	}
+	if length < 0 { // backstop only: unreachable while the guard in the loop above holds
+		return nil, ErrNegativeLength
+	}
+	if p.lim.MaxStrLen > 0 && length > p.lim.MaxStrLen {
+		return nil, fmt.Errorf("%w: declared %d at offset %d", ErrStringTooLong, length, start)
+	}
+	if p.pos >= len(p.data) || p.data[p.pos] != ':' {
+		return nil, fmt.Errorf("%w: missing ':' at offset %d", ErrSyntax, p.pos)
+	}
+	p.pos++ // consume ':'
+	if length > len(p.data)-p.pos { // subtraction form: p.pos+length could wrap when MaxStrLen is 0
+		return nil, fmt.Errorf("%w: declared length %d exceeds remaining %d", ErrEOF, length, len(p.data)-p.pos)
+	}
+	out := p.data[p.pos : p.pos+length]
+	p.pos += length
+	return out, nil
+}
+
+func (p *parser) list(depth int) error {
+	if depth+1 > p.lim.MaxDepth { // entering this list would exceed the nesting budget
+		return fmt.Errorf("%w: list at offset %d", ErrDepth, p.pos)
+	}
+	p.pos++ // consume 'l'
+	count := 0
+	for p.pos < len(p.data) && p.data[p.pos] != 'e' {
+		if p.lim.MaxListLen > 0 && count >= p.lim.MaxListLen { // checked before parsing the next element
+			return fmt.Errorf("%w: at offset %d", ErrListTooLong, p.pos)
+		}
+		if err := p.value(depth + 1); err != nil {
+			return err
+		}
+		count++
+	}
+	if p.pos >= len(p.data) {
+		return fmt.Errorf("%w: unterminated list", ErrEOF)
+	}
+	p.pos++ // consume 'e'
+	return nil
+}
+
+func (p *parser) dict(depth int) error {
+	if depth+1 > p.lim.MaxDepth { // entering this dict would exceed the nesting budget
+		return fmt.Errorf("%w: dict at offset %d", ErrDepth, p.pos)
+	}
+	p.pos++ // consume 'd'
+	keys := 0
+	for p.pos < len(p.data) && p.data[p.pos] != 'e' {
+		if p.lim.MaxKeysPerDict > 0 && keys >= p.lim.MaxKeysPerDict { // checked before parsing the next key
+			return fmt.Errorf("%w: at offset %d", ErrDictTooLarge, p.pos)
+		}
+		// Key must be a byte string.
+		if !(p.data[p.pos] >= '0' && p.data[p.pos] <= '9') {
+			return fmt.Errorf("%w: dict key at offset %d not a string", ErrSyntax, p.pos)
+		}
+		if _, err := p.byteString(); err != nil {
+			return err
+		}
+		// Value: any bencode value, nested one level deeper than this dict.
+		if err := p.value(depth + 1); err != nil {
+			return err
+		}
+		keys++
+	}
+	if p.pos >= len(p.data) {
+		return fmt.Errorf("%w: unterminated dict", ErrEOF)
+	}
+	p.pos++ // consume 'e'
+	return nil
+}
diff --git a/internal/bencode/validate_test.go b/internal/bencode/validate_test.go
new file mode 100644
index 0000000..810aefd
--- /dev/null
+++ b/internal/bencode/validate_test.go
@@ -0,0 +1,184 @@
+package bencode
+
+import (
+	"errors"
+	"strings"
+	"testing"
+)
+
+// looseLimits accepts almost anything — for tests that only care about
+// structural correctness, not size bounds.
+var looseLimits = Limits{
+	MaxBytes:       1 << 30,
+	MaxDepth:       64,
+	MaxStrLen:      1 << 24,
+	MaxListLen:     1 << 16,
+	MaxKeysPerDict: 1 << 14,
+}
+
+func TestValidate_Valid(t *testing.T) {
+	cases := []struct {
+		name string
+		data string
+	}{
+		{"empty string", "0:"},
+		{"short string", "5:hello"},
+		{"zero", "i0e"},
+		{"negative", "i-42e"},
+		{"large positive", "i9223372036854775807e"},
+		{"empty list", "le"},
+		{"list of ints", "li1ei2ei3ee"},
+		{"empty dict", "de"},
+		{"single-key dict", "d3:foo3:bare"},
+		{"nested dict", "d4:infod4:name5:hellodes ee"[:21] + "ee"},
+		{"multi-key dict", "d1:ai1e1:bi2ee"},
+		{"nested list", "lli1eel3:fooee"},
+		{"realistic torrent meta", "d8:announce20:http://tracker.test/4:infod6:lengthi100e4:name4:test12:piece lengthi16384e6:pieces20:" + strings.Repeat("\x00", 20) + "ee"},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			if err := Validate([]byte(c.data), looseLimits); err != nil {
+				t.Errorf("expected ok, got %v", err)
+			}
+		})
+	}
+}
+
+func TestValidate_Malformed(t *testing.T) {
+	cases := []struct {
+		name string
+		data string
+		want error
+	}{
+		{"empty input", "", ErrEOF},
+		{"trailing data", "i1ee", ErrTrailingData},
{"int missing e", "i42", ErrIntegerSyntax}, + {"int empty", "ie", ErrIntegerSyntax}, + {"int leading zero", "i01e", ErrIntegerSyntax}, + {"int negative zero", "i-0e", ErrIntegerSyntax}, + {"int just minus", "i-e", ErrIntegerSyntax}, + {"string missing colon", "5hello", ErrSyntax}, + {"string declared too long", "10:hi", ErrEOF}, + {"string leading-zero length", "01:x", ErrSyntax}, + {"list unterminated", "li1e", ErrEOF}, + {"dict unterminated", "d3:fooi1e", ErrEOF}, + {"dict non-string key", "di1ei1ee", ErrSyntax}, + {"unknown leading byte", "x", ErrSyntax}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + err := Validate([]byte(c.data), looseLimits) + if err == nil { + t.Fatalf("expected error, got nil") + } + if !errors.Is(err, c.want) { + t.Errorf("got %v, want errors.Is %v", err, c.want) + } + }) + } +} + +func TestValidate_DepthLimit(t *testing.T) { + // Build le>e — depth N + build := func(n int) string { + return strings.Repeat("l", n) + strings.Repeat("e", n) + } + lim := looseLimits + lim.MaxDepth = 3 + + if err := Validate([]byte(build(3)), lim); err != nil { + t.Errorf("depth 3 with MaxDepth=3 should pass, got %v", err) + } + err := Validate([]byte(build(4)), lim) + if err == nil { + t.Fatal("depth 4 with MaxDepth=3 should fail") + } + if !errors.Is(err, ErrDepth) { + t.Errorf("expected ErrDepth, got %v", err) + } + + // Mixed dict+list nesting also counts. 
+	mixed := "d1:al1:al1:al1:aleeeee" // d > l > l > l > l (depth 5); rejected at the third list
+	if err := Validate([]byte(mixed), lim); !errors.Is(err, ErrDepth) {
+		t.Errorf("expected ErrDepth on mixed nesting, got %v", err)
+	}
+}
+
+func TestValidate_StringTooLong(t *testing.T) {
+	lim := looseLimits
+	lim.MaxStrLen = 5
+
+	if err := Validate([]byte("5:hello"), lim); err != nil {
+		t.Errorf("len=5 with MaxStrLen=5 should pass, got %v", err)
+	}
+	err := Validate([]byte("6:hello!"), lim)
+	if !errors.Is(err, ErrStringTooLong) {
+		t.Errorf("expected ErrStringTooLong, got %v", err)
+	}
+
+	// Declared length only — never even reads the bytes.
+	err = Validate([]byte("99999999:short"), lim)
+	if !errors.Is(err, ErrStringTooLong) {
+		t.Errorf("expected ErrStringTooLong on declared-only oversize, got %v", err)
+	}
+}
+
+func TestValidate_ListTooLong(t *testing.T) {
+	lim := looseLimits
+	lim.MaxListLen = 3
+
+	if err := Validate([]byte("li1ei2ei3ee"), lim); err != nil {
+		t.Errorf("3 items with MaxListLen=3 should pass, got %v", err)
+	}
+	err := Validate([]byte("li1ei2ei3ei4ee"), lim)
+	if !errors.Is(err, ErrListTooLong) {
+		t.Errorf("expected ErrListTooLong, got %v", err)
+	}
+}
+
+func TestValidate_DictTooLarge(t *testing.T) {
+	lim := looseLimits
+	lim.MaxKeysPerDict = 2
+
+	if err := Validate([]byte("d1:ai1e1:bi2ee"), lim); err != nil {
+		t.Errorf("2 keys with MaxKeysPerDict=2 should pass, got %v", err)
+	}
+	err := Validate([]byte("d1:ai1e1:bi2e1:ci3ee"), lim)
+	if !errors.Is(err, ErrDictTooLarge) {
+		t.Errorf("expected ErrDictTooLarge, got %v", err)
+	}
+}
+
+func TestValidate_TooLarge(t *testing.T) {
+	lim := looseLimits
+	lim.MaxBytes = 4
+
+	if err := Validate([]byte("3:foo"), lim); !errors.Is(err, ErrTooLarge) {
+		t.Errorf("expected ErrTooLarge for 5-byte input with MaxBytes=4, got %v", err)
+	}
+}
+
+func TestValidate_NoAlloc(t *testing.T) {
+	// Quick sanity check: a giant declared-but-not-sent string is rejected
+	// before we'd allocate anything.
+	huge := "9999999999:short"
+	if err := Validate([]byte(huge), TorrentLimits); err == nil {
+		t.Fatal("expected rejection of declared-only multi-GB string")
+	}
+}
+
+func TestValidate_PresetLimits(t *testing.T) {
+	// Valid simple message under each preset
+	simple := []byte("d4:nope0:e")
+	for name, lim := range map[string]Limits{
+		"TorrentLimits":         TorrentLimits,
+		"PeerMessageLimits":     PeerMessageLimits,
+		"TrackerResponseLimits": TrackerResponseLimits,
+	} {
+		t.Run(name, func(t *testing.T) {
+			if err := Validate(simple, lim); err != nil {
+				t.Errorf("%s rejected trivial input: %v", name, err)
+			}
+		})
+	}
+}
diff --git a/internal/client/metadata.go b/internal/client/metadata.go
index a113686..466b890 100644
--- a/internal/client/metadata.go
+++ b/internal/client/metadata.go
@@ -6,6 +6,8 @@ import (
 	"fmt"
 
 	"github.com/zeebo/bencode"
+
+	wbencode "weightless/internal/bencode"
 )
 
 // FetchMetadata fetches the info dictionary from a peer using BEP 9 metadata exchange.
@@ -45,13 +47,18 @@ func (p *PeerConn) FetchMetadata(ctx context.Context, infoHash []byte) ([]byte,
 				return nil, fmt.Errorf("could not find end of bencoded header in piece %d", i)
 			}
 
+			// Validate the header bytes structurally before decoding (BEP 9 dict).
+			headerBytes := msg.Payload[1 : 1+dictEnd]
+			if err := wbencode.Validate(headerBytes, wbencode.PeerMessageLimits); err != nil {
+				return nil, fmt.Errorf("validate metadata header: %w", err)
+			}
 			// Decode the header to check msg_type
 			var header struct {
 				MsgType int `bencode:"msg_type"`
 				Piece   int `bencode:"piece"`
 				Total   int `bencode:"total_size"`
 			}
-			if err := bencode.DecodeBytes(msg.Payload[1:1+dictEnd], &header); err != nil {
+			if err := bencode.DecodeBytes(headerBytes, &header); err != nil {
 				return nil, fmt.Errorf("decode metadata header: %w", err)
 			}
 
diff --git a/internal/client/peer.go b/internal/client/peer.go
index 9ecfb58..2f410cd 100644
--- a/internal/client/peer.go
+++ b/internal/client/peer.go
@@ -9,6 +9,8 @@ import (
 	"time"
 
 	"github.com/zeebo/bencode"
+
+	wbencode "weightless/internal/bencode"
 )
 
 // Extension ID mapping for BEP 10
@@ -144,6 +146,9 @@ func (p *PeerConn) readExtendedHandshake() error {
 		return fmt.Errorf("expected extended handshake (id 0), got %d", m.Payload[0])
 	}
 
+	if err := wbencode.Validate(m.Payload[1:], wbencode.PeerMessageLimits); err != nil {
+		return fmt.Errorf("validate extended handshake: %w", err)
+	}
 	var handshake struct {
 		M        map[string]int `bencode:"m"`
 		Metadata int            `bencode:"metadata_size"`
diff --git a/internal/client/tracker.go b/internal/client/tracker.go
index e5c9c78..10d7274 100644
--- a/internal/client/tracker.go
+++ b/internal/client/tracker.go
@@ -11,6 +11,8 @@ import (
 	"strconv"
 
 	"github.com/zeebo/bencode"
+
+	wbencode "weightless/internal/bencode"
 )
 
 // Announce sends a request to the tracker and returns a list of peer addresses (IP:Port).
@@ -52,6 +54,10 @@ func Announce(ctx context.Context, trackerURL, infoHash, peerID string, port int
 		return nil, fmt.Errorf("read tracker response: %w", err)
 	}
 
+	if err := wbencode.Validate(data, wbencode.TrackerResponseLimits); err != nil {
+		return nil, fmt.Errorf("validate tracker response: %w", err)
+	}
+
 	var trackerResponse struct {
 		FailureReason string `bencode:"failure reason"`
 		Peers         string `bencode:"peers"`
diff --git a/internal/torrent/parse.go b/internal/torrent/parse.go
index 48ac44d..2d6492c 100644
--- a/internal/torrent/parse.go
+++ b/internal/torrent/parse.go
@@ -5,8 +5,16 @@ import (
 	"path/filepath"
 
 	"github.com/zeebo/bencode"
+
+	wbencode "weightless/internal/bencode"
 )
 
+// maxFileTreeDepth bounds BEP 52 file-tree recursion. The bencode validator
+// already caps overall nesting depth, but we keep an explicit guard here as
+// defense-in-depth — walkFileTree must not recurse on attacker-controlled
+// structure even if a future caller skips Validate.
+const maxFileTreeDepth = 64
+
 // TorrentMeta holds parsed torrent metadata for display or registry use.
 type TorrentMeta struct {
 	Name string `json:"name"`
@@ -23,7 +31,12 @@ type FileEntry struct {
 }
 
 // Parse decodes a bencoded .torrent file into display-friendly metadata.
+// LangSec: structurally validate against TorrentLimits before letting the
+// permissive decoder allocate anything.
 func Parse(data []byte) (TorrentMeta, error) {
+	if err := wbencode.Validate(data, wbencode.TorrentLimits); err != nil {
+		return TorrentMeta{}, fmt.Errorf("torrent validate: %w", err)
+	}
 	var raw map[string]interface{}
 	if err := bencode.DecodeBytes(data, &raw); err != nil {
 		return TorrentMeta{}, fmt.Errorf("bencode decode: %w", err)
@@ -80,7 +93,7 @@ func Parse(data []byte) (TorrentMeta, error) {
 	// If no v1 files list, try v2 file tree (BEP 52)
 	if len(meta.Files) == 0 {
 		if fileTree, ok := info["file tree"].(map[string]interface{}); ok {
-			meta.Files = walkFileTree(fileTree, "")
+			meta.Files = walkFileTree(fileTree, "", 0)
 			for _, f := range meta.Files {
 				meta.TotalSize += f.Length
 			}
@@ -96,7 +109,12 @@ func Parse(data []byte) (TorrentMeta, error) {
 }
 
 // walkFileTree recursively walks a BEP 52 file tree and collects file entries.
-func walkFileTree(tree map[string]interface{}, prefix string) []FileEntry {
+// depth is the current recursion level; past maxFileTreeDepth the subtree is
+// dropped (nil is returned), capping stack use even if validation was bypassed.
+func walkFileTree(tree map[string]interface{}, prefix string, depth int) []FileEntry {
+	if depth > maxFileTreeDepth {
+		return nil
+	}
 	var files []FileEntry
 	for name, val := range tree {
 		node, ok := val.(map[string]interface{})
@@ -113,7 +131,7 @@ func walkFileTree(tree map[string]interface{}, prefix string) []FileEntry {
 			files = append(files, FileEntry{Path: path, Length: length})
 		} else {
 			// Directory node: recurse
-			files = append(files, walkFileTree(node, path)...)
+			files = append(files, walkFileTree(node, path, depth+1)...)
 		}
 	}
 	return files
diff --git a/internal/torrent/parse_test.go b/internal/torrent/parse_test.go
index 65cde99..10eb0ff 100644
--- a/internal/torrent/parse_test.go
+++ b/internal/torrent/parse_test.go
@@ -3,6 +3,7 @@ package torrent
 import (
 	"os"
 	"path/filepath"
+	"strings"
 	"testing"
 )
 
@@ -79,3 +80,30 @@ func TestParseDirectory(t *testing.T) {
 		t.Errorf("expected 2 files, got %d", len(meta.Files))
 	}
 }
+
+func TestParseRejectsMalformedBencode(t *testing.T) {
+	cases := []struct {
+		name string
+		data []byte
+	}{
+		{"empty", []byte{}},
+		{"trailing garbage", []byte("d3:foo3:baree!!")},
+		{"unterminated dict", []byte("d3:foo3:bar")},
+		{"non-bencode", []byte("hello world")},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			if _, err := Parse(c.data); err == nil {
+				t.Errorf("expected error for %s, got nil", c.name)
+			}
+		})
+	}
+}
+
+func TestParseRejectsDeeplyNested(t *testing.T) {
+	// Bencode validator caps depth at 64; build 70-deep nested list.
+	data := []byte(strings.Repeat("l", 70) + strings.Repeat("e", 70))
+	if _, err := Parse(data); err == nil {
+		t.Error("expected depth-limit rejection, got nil")
+	}
+}