diff --git a/CHANGELOG.md b/CHANGELOG.md index 52c4c64f..03fb5d0d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,7 @@ ## Unreleased: mitmproxy_rs next +- Refactor protobuf-YAML conversion, fix missing unknown fields on nested known messages. + ## 22 November 2025: mitmproxy_rs 0.12.8 diff --git a/mitmproxy-contentviews/src/protobuf/existing_proto_definitions.rs b/mitmproxy-contentviews/src/protobuf/existing_proto_definitions.rs index 678e0f53..f95eb8c0 100644 --- a/mitmproxy-contentviews/src/protobuf/existing_proto_definitions.rs +++ b/mitmproxy-contentviews/src/protobuf/existing_proto_definitions.rs @@ -7,6 +7,7 @@ use std::path::Path; pub(super) struct DescriptorWithDeps { pub descriptor: MessageDescriptor, + #[allow(unused)] pub dependencies: Vec, } diff --git a/mitmproxy-contentviews/src/protobuf/proto_to_yaml.rs b/mitmproxy-contentviews/src/protobuf/proto_to_yaml.rs index 47111e29..fd9cf404 100644 --- a/mitmproxy-contentviews/src/protobuf/proto_to_yaml.rs +++ b/mitmproxy-contentviews/src/protobuf/proto_to_yaml.rs @@ -1,35 +1,41 @@ +use crate::protobuf::raw_to_proto::{self, GuessedFieldType, guess_varlen_type}; use crate::protobuf::view_protobuf::tags; -use protobuf::MessageDyn; +use anyhow::Context; /// Parsed protobuf message => YAML value use protobuf::descriptor::field_descriptor_proto::Type; -use protobuf::descriptor::field_descriptor_proto::Type::{ - TYPE_BYTES, TYPE_FIXED32, TYPE_FIXED64, TYPE_UINT64, -}; -use protobuf::reflect::{ReflectFieldRef, ReflectValueRef}; +use protobuf::descriptor::field_descriptor_proto::Type::{TYPE_FIXED32, TYPE_FIXED64, TYPE_UINT64}; +use protobuf::reflect::{MessageDescriptor, ReflectFieldRef, ReflectValueRef}; +use protobuf::{MessageDyn, UnknownValueRef}; use serde_yaml::value::TaggedValue; -use serde_yaml::{Mapping, Number, Value}; +use serde_yaml::{Number, Value}; +use std::collections::BTreeMap; use std::ops::Deref; -pub(super) fn message_to_yaml(message: &dyn MessageDyn) -> Value { - let mut ret = Mapping::new(); +pub(super) fn decode_proto_to_yaml( + data: &[u8], + descriptor: &MessageDescriptor, +) -> anyhow::Result { + let message = descriptor + .parse_from_bytes(data) + .context("failed to parse protobuf")?; + Ok(message_to_yaml_value(message.as_ref())) +} - for field in message.descriptor_dyn().fields() { - let is_unknown_field = field.name().starts_with("unknown_field_"); - let key = if is_unknown_field { - Value::from(field.number()) - } else { - Value::from(field.name()) - }; - let field_type = field - .proto() - .type_ - .map(|t| t.enum_value_or(TYPE_BYTES)) - .unwrap_or(TYPE_BYTES); +/// Convert a protobuf message to a YAML [`Value::Mapping`]. +/// +/// Handles known and unknown fields and sorts them by field number. +/// For unknown bytes-values, will attempt to guess whether they're strings, +/// messages, or bytes. +fn message_to_yaml_value(message: &dyn MessageDyn) -> Value { + // (field_number, key, value) + let mut all_field_values: Vec<(i32, Value, Value)> = Vec::new(); + // Go through the known fields + for field in message.descriptor_dyn().fields() { let value = match field.get_reflect(message) { ReflectFieldRef::Optional(x) => { if let Some(x) = x.value() { - value_to_yaml(x, field_type, is_unknown_field) + value_to_yaml(x) } else { continue; } @@ -38,11 +44,7 @@ pub(super) fn message_to_yaml(message: &dyn MessageDyn) -> Value { if x.is_empty() { continue; } - Value::Sequence( - x.into_iter() - .map(|x| value_to_yaml(x, field_type, is_unknown_field)) - .collect(), - ) + Value::Sequence(x.into_iter().map(|x| value_to_yaml(x)).collect()) } ReflectFieldRef::Map(x) => { if x.is_empty() { @@ -50,25 +52,106 @@ pub(super) fn message_to_yaml(message: &dyn MessageDyn) -> Value { } Value::Mapping( x.into_iter() - .map(|(k, v)| { - ( - value_to_yaml(k, field_type, is_unknown_field), - value_to_yaml(v, field_type, is_unknown_field), - ) - }) + .map(|(k, v)| (value_to_yaml(k), value_to_yaml(v))) .collect(), ) } }; - ret.insert(key, value); + + all_field_values.push((field.number(), Value::from(field.name()), value)); + } + + // Group the unknown values by field number + let mut unknown_field_groups: BTreeMap> = BTreeMap::new(); + for (field_number, value) in message.unknown_fields_dyn().iter() { + unknown_field_groups + .entry(field_number as i32) + .or_default() + .push(value); } - Value::Mapping(ret) + + for (field_number, unknown_values) in unknown_field_groups.into_iter() { + // With the fields grouped, see if we should expect just numbers, or bytes which we need to guess the type of + let varlen_count = unknown_values + .iter() + .filter(|v| matches!(v, UnknownValueRef::LengthDelimited(_))) + .count(); + + let values: Vec = if varlen_count == 0 || varlen_count != unknown_values.len() { + // We have only numbers (or bad protobuf with mixed types) + unknown_values + .into_iter() + .map(|v| match v { + UnknownValueRef::Fixed32(num) => tag_number(Number::from(num), TYPE_FIXED32), + UnknownValueRef::Fixed64(num) => tag_number(Number::from(num), TYPE_FIXED64), + UnknownValueRef::Varint(num) => tag_number(Number::from(num), TYPE_UINT64), + UnknownValueRef::LengthDelimited(data) => { + // There *shouldn't* be mixed types, but since we can deal with this edge case, let's. + // We're running the probe per item, but again, that'll do. + match guess_varlen_type(&[data]) { + GuessedFieldType::String(strings) => { + strings.into_iter().map(Value::String).next() + } + GuessedFieldType::Unknown => Some( + // Let's take advantage of value_to_yaml() + value_to_yaml(ReflectValueRef::Bytes(data)), + ), + GuessedFieldType::Message(messages) => messages + .into_iter() + .map(|m| message_to_yaml_value(m.deref())) + .next(), + } + .expect("guess_varlen_type() provided us with no values") + } + }) + .collect() + } else { + // Collect the bytes and try to guess what they represent together + let varlen_bytes: Vec<&[u8]> = unknown_values + .into_iter() + .map(|v| match v { + UnknownValueRef::LengthDelimited(data) => data, + _ => unreachable!("already checked; there should be bytes only"), + }) + .collect(); + + match raw_to_proto::guess_varlen_type(&varlen_bytes) { + GuessedFieldType::String(strings) => { + strings.into_iter().map(Value::String).collect() + } + GuessedFieldType::Unknown => varlen_bytes + .into_iter() + .map(|data| { + // Let's take advantage of value_to_yaml() + value_to_yaml(ReflectValueRef::Bytes(data)) + }) + .collect(), + GuessedFieldType::Message(messages) => messages + .into_iter() + .map(|m| message_to_yaml_value(m.deref())) + .collect(), + } + }; + + // Move out the only Value, or wrap the vector in a YAML Sequence + let value = match values { + v if v.len() == 1 => v.into_iter().next().unwrap(), + v => Value::Sequence(v), + }; + all_field_values.push((field_number, Value::from(field_number), value)); + } + + // Sort the fields by the field number and collect our Mapping + all_field_values.sort_by_key(|t| t.0); + + Value::Mapping(all_field_values.into_iter().map(|t| (t.1, t.2)).collect()) } -fn value_to_yaml(x: ReflectValueRef, field_type: Type, is_unknown: bool) -> Value { +/// Convert `x` into a [`Value`] +fn value_to_yaml(x: ReflectValueRef) -> Value { match x { - ReflectValueRef::U32(x) => tag_number(Number::from(x), field_type, is_unknown), - ReflectValueRef::U64(x) => tag_number(Number::from(x), field_type, is_unknown), + ReflectValueRef::U32(x) => Value::Number(Number::from(x)), + ReflectValueRef::U64(x) => Value::Number(Number::from(x)), ReflectValueRef::I32(x) => Value::Number(Number::from(x)), ReflectValueRef::I64(x) => Value::Number(Number::from(x)), ReflectValueRef::F32(x) => Value::Number(Number::from(x)), @@ -83,27 +166,24 @@ fn value_to_yaml(x: ReflectValueRef, field_type: Type, is_unknown: bool) -> Valu .value_by_number(i) .map(|v| Value::String(v.name().to_string())) .unwrap_or_else(|| Value::Number(Number::from(i))), - ReflectValueRef::Message(m) => message_to_yaml(m.deref()), + ReflectValueRef::Message(m) => message_to_yaml_value(m.deref()), } } -fn tag_number(number: Number, field_type: Type, is_unknown: bool) -> Value { - if !is_unknown { - return Value::Number(number); - } - match field_type { - TYPE_UINT64 => Value::Tagged(Box::new(TaggedValue { - tag: tags::VARINT.clone(), - value: Value::Number(number), - })), - TYPE_FIXED64 => Value::Tagged(Box::new(TaggedValue { - tag: tags::FIXED64.clone(), - value: Value::Number(number), - })), - TYPE_FIXED32 => Value::Tagged(Box::new(TaggedValue { - tag: tags::FIXED32.clone(), +/// Tag a [`Number`] for YAML given a hint type +fn tag_number(number: Number, tag_type: Type) -> Value { + let tag = match tag_type { + TYPE_UINT64 => Some(tags::VARINT.clone()), + TYPE_FIXED64 => Some(tags::FIXED64.clone()), + TYPE_FIXED32 => Some(tags::FIXED32.clone()), + _ => None, + }; + + match tag { + Some(tag) => Value::Tagged(Box::new(TaggedValue { + tag, value: Value::Number(number), })), - _ => Value::Number(number), + None => Value::Number(number), } } diff --git a/mitmproxy-contentviews/src/protobuf/raw_to_proto.rs b/mitmproxy-contentviews/src/protobuf/raw_to_proto.rs index d1538fbf..29bf52be 100644 --- a/mitmproxy-contentviews/src/protobuf/raw_to_proto.rs +++ b/mitmproxy-contentviews/src/protobuf/raw_to_proto.rs @@ -1,64 +1,15 @@ -use crate::protobuf::existing_proto_definitions::DescriptorWithDeps; -use anyhow::Context; -use protobuf::descriptor::field_descriptor_proto::Label::LABEL_REPEATED; -use protobuf::descriptor::field_descriptor_proto::Type; -use protobuf::descriptor::field_descriptor_proto::Type::{ - TYPE_BYTES, TYPE_FIXED32, TYPE_FIXED64, TYPE_STRING, TYPE_UINT64, -}; -use protobuf::descriptor::{DescriptorProto, FieldDescriptorProto, FileDescriptorProto}; +use protobuf::MessageDyn; +use protobuf::descriptor::{DescriptorProto, FileDescriptorProto}; use protobuf::reflect::{FileDescriptor, MessageDescriptor}; -use protobuf::{EnumOrUnknown, MessageDyn, UnknownValueRef}; /// Existing protobuf definition + raw data => merged protobuf definition -use std::collections::BTreeMap; +use std::str; -enum GuessedFieldType { - String, - Message(Box), +pub(crate) enum GuessedFieldType { + String(Vec), + Message(Vec>), Unknown, } -/// Create a "merged" MessageDescriptor. Mostly a wrapper around `create_descriptor_proto`. -pub(super) fn merge_proto_and_descriptor( - data: &[u8], - desc: &DescriptorWithDeps, -) -> anyhow::Result { - let new_proto = create_descriptor_proto(data, &desc.descriptor)?; - - let descriptor = { - let mut file_descriptor_proto = desc.descriptor.file_descriptor_proto().clone(); - - let message_idx = file_descriptor_proto - .message_type - .iter() - .enumerate() - .filter_map(|(i, d)| (d.name() == desc.descriptor.name_to_package()).then_some(i)) - .next() - .context("failed to find existing message descriptor index")?; - file_descriptor_proto.message_type[message_idx] = new_proto; - - /* - XXX: Skipping this as it doesn't seem to bring any immediate benefits. - let dependencies = dependencies - .iter() - .cloned() - .filter(|d| d != existing.file_descriptor()) - .collect::>(); - */ - - FileDescriptor::new_dynamic(file_descriptor_proto, &desc.dependencies) - .context("failed to create new dynamic file descriptor")? - .message_by_package_relative_name(desc.descriptor.name_to_package()) - .with_context(|| { - format!( - "did not find {} in descriptor", - desc.descriptor.name_to_package() - ) - })? - }; - - Ok(descriptor) -} - /// Create a new (empty) MessageDescriptor for the given package and name. pub(super) fn new_empty_descriptor(package: Option, name: &str) -> MessageDescriptor { // Create nested descriptor protos. For example, if the name is OuterMessage.InnerMessage, @@ -89,107 +40,39 @@ pub(super) fn new_empty_descriptor(package: Option, name: &str) -> Messa .unwrap() } -/// Create a DescriptorProto that combines the `existing` MessageDescriptor with (guessed) -/// metadata for all unknown fields in the protobuf `data`. -fn create_descriptor_proto( - data: &[u8], - existing: &MessageDescriptor, -) -> anyhow::Result { - let message = existing.parse_from_bytes(data).with_context(|| { - if existing.full_name().starts_with("Unknown") { - "invalid format".to_string() - } else { - format!("failed to parse {}", existing.full_name()) - } - })?; - - let mut descriptor = existing.proto().clone(); - - let mut field_groups: BTreeMap> = BTreeMap::new(); - for (field_number, value) in message.unknown_fields_dyn().iter() { - field_groups.entry(field_number).or_default().push(value); - } - - for (field_index, field_values) in field_groups.into_iter() { - let name = Some(format!("unknown_field_{field_index}")); - let mut add_int = |name: Option, typ: Type| { - descriptor.field.push(FieldDescriptorProto { - number: Some(field_index as i32), - name, - type_: Some(EnumOrUnknown::from(typ)), - ..Default::default() - }); - }; - match field_values[0] { - // We can't use float/double here because of NaN handling. - UnknownValueRef::Fixed32(_) => add_int(name, TYPE_FIXED32), - UnknownValueRef::Fixed64(_) => add_int(name, TYPE_FIXED64), - UnknownValueRef::Varint(_) => add_int(name, TYPE_UINT64), - UnknownValueRef::LengthDelimited(_) => { - let field_values = field_values - .iter() - .map(|x| match x { - UnknownValueRef::LengthDelimited(data) => Ok(*data), - _ => Err(anyhow::anyhow!("varying types in protobuf")), - }) - .collect::>>()?; +/// Given a set of protobuf varlen field values, try to figure out the best type to display them. +/// +/// In the case of [`GuessedFieldType::String`] and [`GuessedFieldType::Message`], +/// the decoded values are included. +pub(crate) fn guess_varlen_type(values: &[&[u8]]) -> GuessedFieldType { + // Try to decode as printable strings + let strings: Option> = values + .iter() + .map(|data| { + str::from_utf8(data).ok().and_then(|s| { + s.chars() + // Check that the string is printable + .all(|c| c.is_ascii_graphic() || c.is_ascii_whitespace()) + .then(|| s.to_string()) + }) + }) + .collect(); - match guess_field_type(existing, field_index, &field_values) { - GuessedFieldType::String => add_int(name, TYPE_STRING), - GuessedFieldType::Unknown => add_int(name, TYPE_BYTES), - GuessedFieldType::Message(m) => { - descriptor.field.push(FieldDescriptorProto { - name, - number: Some(field_index as i32), - type_name: Some(format!(".{}.{}", existing.full_name(), m.name())), - type_: Some(EnumOrUnknown::from(Type::TYPE_MESSAGE)), - ..Default::default() - }); - descriptor.nested_type.push(*m); - } - } - } - } - if field_values.len() > 1 { - descriptor - .field - .last_mut() - .expect("we just added this field") - .set_label(LABEL_REPEATED); - } + if let Some(strings) = strings { + return GuessedFieldType::String(strings); } - Ok(descriptor) -} - -/// Given all `values` of a field, guess its type. -fn guess_field_type( - parent: &MessageDescriptor, - field_index: u32, - values: &[&[u8]], -) -> GuessedFieldType { - if values.iter().all(|data| { - std::str::from_utf8(data).is_ok_and(|s| { - s.chars() - .all(|c| c.is_ascii_graphic() || c.is_ascii_whitespace()) - }) - }) { - return GuessedFieldType::String; - } + // Try to decode protobuf with an empty message type (all unknowns!) + let descriptor = new_empty_descriptor(None, "Unknown"); + let messages: Option> = values + .iter() + .map(|data| descriptor.parse_from_bytes(data).ok()) + .collect(); - // Try to parse as a nested message - let existing = new_empty_descriptor( - parent.file_descriptor_proto().package.clone(), - &format!("{}.UnknownField{}", parent.name_to_package(), field_index), - ); - if let Ok(descriptor) = create_descriptor_proto(values[0], &existing) - && values - .iter() - .skip(1) - .all(|data| descriptor.descriptor_dyn().parse_from_bytes(data).is_ok()) - { - return GuessedFieldType::Message(Box::new(descriptor)); + if let Some(messages) = messages { + return GuessedFieldType::Message(messages); } + // Fall back to unrecognized bytestrings GuessedFieldType::Unknown } diff --git a/mitmproxy-contentviews/src/protobuf/view_grpc.rs b/mitmproxy-contentviews/src/protobuf/view_grpc.rs index f67f495e..42a93409 100644 --- a/mitmproxy-contentviews/src/protobuf/view_grpc.rs +++ b/mitmproxy-contentviews/src/protobuf/view_grpc.rs @@ -188,6 +188,15 @@ mod tests { const TEST_YAML_KNOWN: &str = "example: 150\n\n---\n\nexample: 150\n"; + const TEST_WITH_EXTRA_GRPC: &[u8] = &[0, 0, 0, 0, 10, 8, 123, 18, 3, 8, 200, 3, 40, 171, 4]; + const TEST_WITH_EXTRA_YAML: &str = + "example: 123\nnested:\n example: 456\n5: 555 # !sint: -278\n"; + + const TEST_WITH_NESTED_EXTRA_GRPC: &[u8] = &[ + 0, 0, 0, 0, 13, 8, 123, 18, 6, 8, 200, 3, 16, 142, 5, 40, 171, 4, + ]; + const TEST_WITH_NESTED_EXTRA_YAML: &str = "example: 123\nnested:\n example: 456\n 2: 654 # !sint: 327\n5: 555 # !sint: -278\n"; + #[test] fn existing_proto() { let metadata = TestMetadata::default().with_protobuf_definitions(concat!( @@ -248,6 +257,34 @@ mod tests { assert_eq!(req, TEST_YAML_KNOWN); } + // When there's an extra unknown field on the root message + #[test] + fn existing_unknown() { + let metadata = TestMetadata::default() + .with_path("/example.nested.Service/Method") + .with_protobuf_definitions(concat!( + env!("CARGO_MANIFEST_DIR"), + "/testdata/protobuf/nested.proto" + )); + let req = GRPC.prettify(TEST_WITH_EXTRA_GRPC, &metadata).unwrap(); + assert_eq!(req, TEST_WITH_EXTRA_YAML); + } + + // When there's an extra unknown field on a nested known message + #[test] + fn existing_unknown_nested() { + let metadata = TestMetadata::default() + .with_path("/example.nested.Service/Method") + .with_protobuf_definitions(concat!( + env!("CARGO_MANIFEST_DIR"), + "/testdata/protobuf/nested.proto" + )); + let req = GRPC + .prettify(TEST_WITH_NESTED_EXTRA_GRPC, &metadata) + .unwrap(); + assert_eq!(req, TEST_WITH_NESTED_EXTRA_YAML); + } + /// When the existing proto definition does not match the wire data, /// but the wire data is still valid Protobuf, parse it raw. #[test] diff --git a/mitmproxy-contentviews/src/protobuf/view_protobuf.rs b/mitmproxy-contentviews/src/protobuf/view_protobuf.rs index 4abd7979..18d4ef62 100644 --- a/mitmproxy-contentviews/src/protobuf/view_protobuf.rs +++ b/mitmproxy-contentviews/src/protobuf/view_protobuf.rs @@ -1,7 +1,5 @@ use crate::protobuf::existing_proto_definitions::DescriptorWithDeps; -use crate::protobuf::{ - existing_proto_definitions, proto_to_yaml, raw_to_proto, reencode, yaml_to_pretty, -}; +use crate::protobuf::{existing_proto_definitions, proto_to_yaml, reencode, yaml_to_pretty}; use crate::{Metadata, Prettify, Reencode}; use anyhow::{Context, Result}; use log::info; @@ -82,13 +80,7 @@ impl Protobuf { return Ok("{} # empty protobuf message".to_string()); } - let descriptor = raw_to_proto::merge_proto_and_descriptor(data, descriptor)?; - - // Parse protobuf and convert to YAML - let message = descriptor - .parse_from_bytes(data) - .context("Error parsing protobuf")?; - let yaml_value = proto_to_yaml::message_to_yaml(message.as_ref()); + let yaml_value = proto_to_yaml::decode_proto_to_yaml(data, &descriptor.descriptor)?; let yaml_str = serde_yaml::to_string(&yaml_value).context("Failed to convert to YAML")?; yaml_to_pretty::apply_replacements(&yaml_str) @@ -219,6 +211,19 @@ mod tests { const VARINT_PRETTY_YAML: &str = "example: 150\n"; + const EXTRA_STRING_PROTO: &[u8] = &[ + 8, 10, 26, 5, 104, 101, 108, 108, 111, 26, 5, 116, 104, 101, 114, 101, + ]; + const EXTRA_STRING_YAML: &str = "example: 10\n3:\n- hello\n- there\n"; + + const EXTRA_MESSAGE_PROTO: &[u8] = &[ + 8, 42, 18, 36, 26, 28, 97, 32, 110, 101, 115, 116, 101, 100, 32, 109, 101, 115, 115, + 97, 103, 101, 32, 119, 105, 116, 104, 32, 98, 121, 116, 101, 115, 33, 34, 4, 222, 173, + 190, 239, + ]; + const EXTRA_MESSAGE_YAML: &str = + "example: 42\n2:\n 3: a nested message with bytes!\n 4: !binary deadbeef\n"; + #[test] fn prettify() { let metadata = TestMetadata::default().with_protobuf_definitions(concat!( @@ -241,6 +246,27 @@ mod tests { assert_eq!(result, string::YAML); } + /// When the wire data has additional fields + #[test] + fn extra_unknown_fields() { + let metadata = TestMetadata::default().with_protobuf_definitions(concat!( + env!("CARGO_MANIFEST_DIR"), + "/testdata/protobuf/simple.proto" + )); + let result = Protobuf.prettify(EXTRA_STRING_PROTO, &metadata).unwrap(); + assert_eq!(result, EXTRA_STRING_YAML); + } + /// When the wire data has an additional unknown message + #[test] + fn extra_unknown_message() { + let metadata = TestMetadata::default().with_protobuf_definitions(concat!( + env!("CARGO_MANIFEST_DIR"), + "/testdata/protobuf/simple.proto" + )); + let result = Protobuf.prettify(EXTRA_MESSAGE_PROTO, &metadata).unwrap(); + assert_eq!(result, EXTRA_MESSAGE_YAML); + } + #[test] fn reencode() { let metadata = TestMetadata::default().with_protobuf_definitions(concat!(