diff --git a/apps/limiter/src/lim_otel_log_filter.erl b/apps/limiter/src/lim_otel_log_filter.erl new file mode 100644 index 0000000..bd806b4 --- /dev/null +++ b/apps/limiter/src/lim_otel_log_filter.erl @@ -0,0 +1,65 @@ +%%% @doc +%%% Logger filter для otel_logs handler: конвертирует otel_trace_id и otel_span_id +%%% из hex-формата (32/16 символов) в raw bytes (16/8 байт), как требует OTLP LogRecord. +%%% opentelemetry hex_span_ctx возвращает hex, collector ожидает bytes. +%%% @end +-module(lim_otel_log_filter). + +-export([filter/2]). +-export([format_otp_report_utf8/1]). + +-spec filter(logger:log_event(), term()) -> logger:filter_return(). +filter(#{meta := Meta} = LogEvent, _FilterConfig) -> + case convert_otel_ids(Meta) of + Meta -> + LogEvent; + Meta1 -> + LogEvent#{meta => Meta1} + end. + +%% Конвертируем hex -> raw bytes только если формат hex (32/16 символов). +%% OTLP LogRecord: trace_id=16 bytes, span_id=8 bytes. +convert_otel_ids(#{otel_trace_id := TraceIdHex, otel_span_id := SpanIdHex} = Meta) -> + case {hex_to_trace_id_bytes(TraceIdHex), hex_to_span_id_bytes(SpanIdHex)} of + {TraceIdBytes, SpanIdBytes} when TraceIdBytes =/= undefined, SpanIdBytes =/= undefined -> + Meta#{otel_trace_id => TraceIdBytes, otel_span_id => SpanIdBytes}; + _ -> + %% Некорректный формат — убираем, чтобы otel_otlp_logs не отправил в OTLP + maps:without([otel_trace_id, otel_span_id, otel_trace_flags], Meta) + end; +convert_otel_ids(Meta) -> + Meta. + +%% logger:format_otp_report/1 возвращает chardata (часто list()), +%% из-за чего downstream JSON может сериализовать body как массив байт. +%% Явно приводим к UTF-8 binary(), чтобы body в OTel/Loki был строкой. +-spec format_otp_report_utf8(logger:report()) -> {unicode:chardata(), list()}. +format_otp_report_utf8(Report) -> + Bin = + try logger:format_otp_report(Report) of + {Format, Args} -> + unicode:characters_to_binary(io_lib:format(Format, Args)) + catch + _:_ -> + %% Не даём report_cb падать: fallback в печатное представление отчёта. + unicode:characters_to_binary(io_lib:format("~tp", [Report])) + end, + {"~ts", [Bin]}. + +hex_to_trace_id_bytes(Hex) when is_binary(Hex), byte_size(Hex) =:= 32 -> + try + <<(binary_to_integer(Hex, 16)):128>> + catch + _:_ -> undefined + end; +hex_to_trace_id_bytes(_) -> + undefined. + +hex_to_span_id_bytes(Hex) when is_binary(Hex), byte_size(Hex) =:= 16 -> + try + <<(binary_to_integer(Hex, 16)):64>> + catch + _:_ -> undefined + end; +hex_to_span_id_bytes(_) -> + undefined. diff --git a/apps/limiter/src/lim_otel_log_handler.erl b/apps/limiter/src/lim_otel_log_handler.erl new file mode 100644 index 0000000..0513cae --- /dev/null +++ b/apps/limiter/src/lim_otel_log_handler.erl @@ -0,0 +1,48 @@ +-module(lim_otel_log_handler). + +-export([log/2]). +-export([adding_handler/1]). +-export([removing_handler/1]). +-export([changing_config/3]). +-export([filter_config/1]). + +-spec log(logger:log_event(), map()) -> ok. +log(LogEvent, Config) -> + otel_log_handler:log(LogEvent, Config). + +-spec adding_handler(map()) -> {ok, map()} | {error, term()}. +adding_handler(Config) -> + otel_log_handler:adding_handler(merge_module_config(Config)). + +-spec removing_handler(map()) -> ok. +removing_handler(Config) -> + otel_log_handler:removing_handler(Config). + +-spec changing_config(set | update, map(), map()) -> {ok, map()} | {error, term()}. +changing_config(SetOrUpdate, OldConfig, NewConfig) -> + otel_log_handler:changing_config( + SetOrUpdate, + merge_module_config(OldConfig), + merge_module_config(NewConfig) + ). + +-spec filter_config(map()) -> map(). +filter_config(Config) -> + otel_log_handler:filter_config(Config). + +%% Переносим ключи из вложенного config в корневой map для otel_log_handler. +%% Только ключи, которые ожидает otel_log_handler — чтобы случайно не перезаписать +%% верхнеуровневые настройки logger (level, filters, filter_default и т.д.). +-define(OTEL_LOG_HANDLER_KEYS, [ + exporter, + report_cb, + max_queue_size, + scheduled_delay_ms, + exporting_timeout_ms +]). + +merge_module_config(#{config := ModuleConfig} = Config) when is_map(ModuleConfig) -> + OtelConfig = maps:with(?OTEL_LOG_HANDLER_KEYS, ModuleConfig), + maps:merge(Config, OtelConfig); +merge_module_config(Config) -> + Config. diff --git a/apps/limiter/src/limiter.app.src b/apps/limiter/src/limiter.app.src index 344e6c9..2a6584a 100644 --- a/apps/limiter/src/limiter.app.src +++ b/apps/limiter/src/limiter.app.src @@ -16,7 +16,8 @@ dmt_client, opentelemetry_api, opentelemetry_exporter, - opentelemetry + opentelemetry, + opentelemetry_experimental ]}, {mod, {limiter, []}}, {env, []} diff --git a/apps/limiter/src/limiter.erl b/apps/limiter/src/limiter.erl index e2788e2..9dbcf94 100644 --- a/apps/limiter/src/limiter.erl +++ b/apps/limiter/src/limiter.erl @@ -17,11 +17,18 @@ -spec start(normal, any()) -> {ok, pid()} | {error, any()}. start(_StartType, _StartArgs) -> - ok = setup_metrics(), - supervisor:start_link({local, ?MODULE}, ?MODULE, []). + case ensure_otel_log_handler() of + ok -> + ok = setup_metrics(), + supervisor:start_link({local, ?MODULE}, ?MODULE, []); + {error, Reason} -> + logger:error("Failed to add otel_logs handler: ~p", [Reason]), + {error, Reason} + end. -spec stop(any()) -> ok. stop(_State) -> + ok = flush_otel_logs(), ok. %% @@ -99,3 +106,62 @@ get_prometheus_route() -> setup_metrics() -> ok = woody_ranch_prometheus_collector:setup(), ok = woody_hackney_prometheus_collector:setup(). + +ensure_otel_log_handler() -> + case logger:get_handler_config(otel_logs) of + {ok, _} -> + ok; + _ -> + MaxQueue = application:get_env(limiter, otel_log_max_queue_size, 2048), + DelayMs = application:get_env(limiter, otel_log_scheduled_delay_ms, 1000), + TimeoutMs = application:get_env(limiter, otel_log_exporting_timeout_ms, 300000), + LogLevel = application:get_env(limiter, otel_log_level, info), + HandlerConfig = #{ + report_cb => fun lim_otel_log_filter:format_otp_report_utf8/1, + exporter => + {otel_exporter_logs_otlp, #{ + protocol => http_protobuf, + ssl_options => [] + }}, + max_queue_size => MaxQueue, + scheduled_delay_ms => DelayMs, + exporting_timeout_ms => TimeoutMs + }, + LoggerHandlerConfig = #{ + level => LogLevel, + filter_default => log, + filters => [{lim_otel_trace_id_bytes, {fun lim_otel_log_filter:filter/2, undefined}}], + config => HandlerConfig + }, + case logger:add_handler(otel_logs, lim_otel_log_handler, LoggerHandlerConfig) of + ok -> + ok; + {error, {already_exist, _}} -> + ok; + {error, Reason} -> + {error, {otel_log_handler_failed, Reason}} + end + end. + +%% @doc Ждём отправки буферизованных логов перед остановкой. +%% otel_log_handler батчит логи и отправляет по таймеру (scheduled_delay_ms). +%% Явного API для flush у otel_log_handler нет, поэтому ждём один полный цикл +%% батчинга + запас на сетевую отправку (export overhead). +-define(FLUSH_EXPORT_OVERHEAD_MS, 700). +-define(FLUSH_MAX_WAIT_MS, 5000). + +flush_otel_logs() -> + case logger:get_handler_config(otel_logs) of + {ok, HandlerCfg} -> + Config = maps:get(config, HandlerCfg, #{}), + DelayMs = maps:get( + scheduled_delay_ms, + Config, + maps:get(scheduled_delay_ms, HandlerCfg, 1000) + ), + _ = logger:info("otel_log_handler_flush"), + timer:sleep(erlang:min(?FLUSH_MAX_WAIT_MS, DelayMs + ?FLUSH_EXPORT_OVERHEAD_MS)), + ok; + _ -> + ok + end. diff --git a/compose.tracing.yaml b/compose.tracing.yaml index 2d3d320..2bb9628 100644 --- a/compose.tracing.yaml +++ b/compose.tracing.yaml @@ -1,33 +1,91 @@ +# UI: Grafana http://localhost:3000 (admin/admin) services: + # OpenTelemetry Collector: single OTLP endpoint, fans out to Tempo + Loki + otel-collector: + image: otel/opentelemetry-collector-contrib:0.112.0 + command: ["--config=/etc/otel/config.yaml"] + volumes: + - ./test/tracing/otel-collector-config.yaml:/etc/otel/config.yaml:ro + ports: + - "4317:4317" # OTLP gRPC + - "4318:4318" # OTLP HTTP + healthcheck: + test: ["CMD", "/otelcol-contrib", "--version"] + interval: 5s + timeout: 2s + retries: 20 + start_period: 5s + depends_on: + tempo: + condition: service_healthy + loki: + condition: service_healthy dmt: environment: &otlp_enabled OTEL_TRACES_EXPORTER: otlp + OTEL_LOGS_EXPORTER: otlp OTEL_TRACES_SAMPLER: parentbased_always_off OTEL_EXPORTER_OTLP_PROTOCOL: http_protobuf - OTEL_EXPORTER_OTLP_ENDPOINT: http://jaeger:4318 + OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4318 + OTEL_EXPORTER_OTLP_LOGS_ENDPOINT: http://otel-collector:4318/v1/logs + OTEL_SERVICE_NAME: dmt testrunner: environment: <<: *otlp_enabled - OTEL_SERVICE_NAME: limiter_testrunner + OTEL_SERVICE_NAME: limiter OTEL_TRACES_SAMPLER: parentbased_always_on depends_on: - jaeger: + dmt: + condition: service_healthy + liminator: + condition: service_healthy + otel-collector: condition: service_healthy + grafana: + condition: service_started - jaeger: - image: jaegertracing/all-in-one:1.47 - environment: - - COLLECTOR_OTLP_ENABLED=true + tempo: + image: grafana/tempo:2.6.1 + command: ["-config.file=/etc/tempo.yaml"] + volumes: + - ./test/tracing/tempo.yaml:/etc/tempo.yaml:ro + ports: + - 3200:3200 healthcheck: - test: "/go/bin/all-in-one-linux status" - interval: 2s - timeout: 1s + test: ["CMD-SHELL", "wget -q -O- http://localhost:3200/ready || exit 1"] + interval: 5s + timeout: 2s retries: 20 + start_period: 5s + + loki: + image: grafana/loki:3.1.1 + command: ["-config.file=/etc/loki/config.yaml"] + volumes: + - ./test/tracing/loki.yaml:/etc/loki/config.yaml:ro ports: - - 4317:4317 # OTLP gRPC receiver - - 4318:4318 # OTLP http receiver - - 5778:5778 - - 14250:14250 - - 16686:16686 + - 3100:3100 + healthcheck: + test: ["CMD-SHELL", "wget -q -O- http://localhost:3100/ready || exit 1"] + interval: 5s + timeout: 2s + retries: 20 + start_period: 5s + + grafana: + image: grafana/grafana:12.3.3 + environment: + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_USERS_ALLOW_SIGN_UP=false + volumes: + - ./test/tracing/grafana/provisioning:/etc/grafana/provisioning:ro + ports: + - 3000:3000 + depends_on: + loki: + condition: service_healthy + tempo: + condition: service_healthy diff --git a/config/sys.config b/config/sys.config index 28105ec..dd6ea58 100644 --- a/config/sys.config +++ b/config/sys.config @@ -1,5 +1,10 @@ [ {limiter, [ + %% OTEL log handler configuration + {otel_log_level, info}, + {otel_log_max_queue_size, 2048}, + {otel_log_scheduled_delay_ms, 1000}, + {otel_log_exporting_timeout_ms, 300000}, {ip, "::"}, {port, 8022}, {services, #{ @@ -106,6 +111,10 @@ {storage, scoper_storage_logger} ]}, + {opentelemetry, [ + {processors, [{otel_batch_processor, #{scheduled_delay_ms => 1000}}]} + ]}, + {prometheus, [ {collectors, [default]} ]} diff --git a/rebar.config b/rebar.config index 7aeabe0..e0a2316 100644 --- a/rebar.config +++ b/rebar.config @@ -37,9 +37,29 @@ {scoper, {git, "https://github.com/valitydev/scoper.git", {tag, "v1.1.0"}}}, {woody, {git, "https://github.com/valitydev/woody_erlang.git", {tag, "v1.1.0"}}}, {dmt_client, {git, "https://github.com/valitydev/dmt_client.git", {tag, "v2.0.3"}}}, - {opentelemetry_api, "1.4.0"}, - {opentelemetry, "1.5.0"}, - {opentelemetry_exporter, "1.8.0"} + + %% OpenTelemetry deps (otel_log_handler, otel_exporter_logs_otlp from opentelemetry_experimental) + {opentelemetry_api, "1.5.0"}, + {opentelemetry, "1.7.0"}, + {opentelemetry_exporter, + {git_subdir, "https://github.com/valitydev/opentelemetry-erlang.git", + {branch, "fix/otlp-common-charlist-string"}, "apps/opentelemetry_exporter"}}, + {opentelemetry_api_experimental, "0.5.1"}, + {opentelemetry_experimental, + {git_subdir, "https://github.com/valitydev/opentelemetry-erlang.git", + {branch, "fix/otlp-common-charlist-string"}, "apps/opentelemetry_experimental"}} +]}. + +%% opentelemetry_experimental из git требует ~> 0.5.2 и старые otel (1.4/1.5), +%% но opentelemetry_exporter требует 1.5.0/1.7.0. Переопределяем на новые версии. +{overrides, [ + {override, opentelemetry_experimental, [ + {deps, [ + {opentelemetry_api, "1.5.0"}, + {opentelemetry, "1.7.0"}, + {opentelemetry_api_experimental, "0.5.1"} + ]} + ]} ]}. %% XRef checks @@ -76,6 +96,7 @@ {runtime_tools, load}, {tools, load}, {opentelemetry, temporary}, + {opentelemetry_experimental, load}, {logger_logstash_formatter, load}, prometheus, prometheus_cowboy, diff --git a/rebar.lock b/rebar.lock index 207a40a..bf89b63 100644 --- a/rebar.lock +++ b/rebar.lock @@ -45,6 +45,14 @@ {<<"mimerl">>,{pkg,<<"mimerl">>,<<"1.4.0">>},2}, {<<"opentelemetry">>,{pkg,<<"opentelemetry">>,<<"1.5.0">>},0}, {<<"opentelemetry_api">>,{pkg,<<"opentelemetry_api">>,<<"1.4.0">>},0}, + {<<"opentelemetry_api_experimental">>, + {pkg,<<"opentelemetry_api_experimental">>,<<"0.5.1">>}, + 0}, + {<<"opentelemetry_experimental">>, + {git_subdir,"https://github.com/valitydev/opentelemetry-erlang.git", + {ref,"685389b35fb188166e072c389a487a8e5a9f0804"}, + "apps/opentelemetry_experimental"}, + 0}, {<<"opentelemetry_exporter">>, {pkg,<<"opentelemetry_exporter">>,<<"1.8.0">>}, 0}, @@ -99,6 +107,7 @@ {<<"mimerl">>, <<"3882A5CA67FBBE7117BA8947F27643557ADEC38FA2307490C4C4207624CB213B">>}, {<<"opentelemetry">>, <<"7DDA6551EDFC3050EA4B0B40C0D2570423D6372B97E9C60793263EF62C53C3C2">>}, {<<"opentelemetry_api">>, <<"63CA1742F92F00059298F478048DFB826F4B20D49534493D6919A0DB39B6DB04">>}, + {<<"opentelemetry_api_experimental">>, <<"1B5AFACFCBD0834390336C845BC8AE08C8CF0D69BBED72EE53D178798B93E074">>}, {<<"opentelemetry_exporter">>, <<"5D546123230771EF4174E37BEDFD77E3374913304CD6EA3CA82A2ADD49CD5D56">>}, {<<"parse_trans">>, <<"16328AB840CC09919BD10DAB29E431DA3AF9E9E7E7E6F0089DD5A2D2820011D8">>}, {<<"prometheus">>, <<"B95F8DE8530F541BD95951E18E355A840003672E5EDA4788C5FA6183406BA29A">>}, @@ -128,6 +137,7 @@ {<<"mimerl">>, <<"13AF15F9F68C65884ECCA3A3891D50A7B57D82152792F3E19D88650AA126B144">>}, {<<"opentelemetry">>, <<"CDF4F51D17B592FC592B9A75F86A6F808C23044BA7CF7B9534DEBBCC5C23B0EE">>}, {<<"opentelemetry_api">>, <<"3DFBBFAA2C2ED3121C5C483162836C4F9027DEF469C41578AF5EF32589FCFC58">>}, + {<<"opentelemetry_api_experimental">>, <<"10297057EADA47267D4F832011BECEF07D25690E6BF91FEBCCFC4E740DBA1A6F">>}, {<<"opentelemetry_exporter">>, <<"A1F9F271F8D3B02B81462A6BFEF7075FD8457FDB06ADFF5D2537DF5E2264D9AF">>}, {<<"parse_trans">>, <<"07CD9577885F56362D414E8C4C4E6BDF10D43A8767ABB92D24CBE8B24C54888B">>}, {<<"prometheus">>, <<"719862351AABF4DF7079B05DC085D2BBCBE3AC0AC3009E956671B1D5AB88247D">>}, diff --git a/test/tracing/grafana/provisioning/datasources/datasources.yaml b/test/tracing/grafana/provisioning/datasources/datasources.yaml new file mode 100644 index 0000000..d89d544 --- /dev/null +++ b/test/tracing/grafana/provisioning/datasources/datasources.yaml @@ -0,0 +1,39 @@ +apiVersion: 1 + +datasources: + - name: Loki + uid: loki + type: loki + access: proxy + url: http://loki:3100 + isDefault: true + jsonData: + maxLines: 5000 + derivedFields: + # По structured metadata (Fields) — trace_id из OTLP не в теле лога + - name: trace_id + matcherType: label + matcherRegex: '^trace_id$' + datasourceUid: tempo + url: '$${__value.raw}' + urlDisplayLabel: 'View Trace' + + - name: Tempo + uid: tempo + type: tempo + access: proxy + url: http://tempo:3200 + jsonData: + tracesToLogsV2: + datasourceUid: loki + spanStartTimeShift: -5m + spanEndTimeShift: 5m + tags: ['service.name', 'service_name'] + # OTel: запрос к Loki по trace_id (structured metadata). service_name=~".+" — обязательный + # matcher (Loki требует хотя бы один непустой matcher). + customQuery: true + query: '{service_name=~".+"} | trace_id="$${__span.traceId}"' + serviceMap: + datasourceUid: tempo + nodeGraph: + enabled: true diff --git a/test/tracing/loki.yaml b/test/tracing/loki.yaml new file mode 100644 index 0000000..af9dcae --- /dev/null +++ b/test/tracing/loki.yaml @@ -0,0 +1,38 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + # Увеличиваем лимит gRPC (по умолчанию 4MB), чтобы большие ответы querier->frontend не падали с ResourceExhausted + grpc_server_max_recv_msg_size: 104857600 # 100 MiB + grpc_server_max_send_msg_size: 104857600 # 100 MiB + +common: + path_prefix: /tmp/loki + replication_factor: 1 + ring: + kvstore: + store: inmemory + +schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +storage_config: + filesystem: + directory: /tmp/loki/chunks + +limits_config: + allow_structured_metadata: true + volume_enabled: true + +pattern_ingester: + enabled: true + +ruler: + enable_api: true diff --git a/test/tracing/otel-collector-config.yaml b/test/tracing/otel-collector-config.yaml new file mode 100644 index 0000000..c889896 --- /dev/null +++ b/test/tracing/otel-collector-config.yaml @@ -0,0 +1,48 @@ +# OpenTelemetry Collector: receives OTLP, exports traces to Tempo, logs to Loki +# Логи в Loki — через нативный OTLP (/otlp), чтобы атрибуты логов попадали в Fields (structured metadata). +# Устаревший loki exporter (/loki/api/v1/push) атрибуты в Loki не передаёт. +receivers: + otlp: + protocols: + grpc: + endpoint: "0.0.0.0:4317" + max_recv_msg_size_mib: 100 # по умолчанию 4 MiB — большие батчи дают ResourceExhausted + http: + endpoint: "0.0.0.0:4318" + +exporters: + # Traces -> Tempo + otlp/tempo: + endpoint: "tempo:4317" + tls: + insecure: true + + # Logs -> Loki (нативный OTLP: атрибуты лога → Structured Metadata → Fields в Grafana) + otlphttp/loki: + endpoint: "http://loki:3100/otlp" + tls: + insecure: true + retry_on_failure: + enabled: true + max_elapsed_time: 30s + +processors: + batch: + send_batch_size: 512 + timeout: 2s + +extensions: + health_check: + endpoint: "0.0.0.0:13133" + +service: + extensions: [health_check] + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [otlp/tempo] + logs: + receivers: [otlp] + processors: [batch] + exporters: [otlphttp/loki] diff --git a/test/tracing/tempo.yaml b/test/tracing/tempo.yaml new file mode 100644 index 0000000..aa4933d --- /dev/null +++ b/test/tracing/tempo.yaml @@ -0,0 +1,41 @@ +server: + http_listen_port: 3200 + # Лимиты gRPC (по умолчанию 4MB), чтобы большие трейсы/ответы не давали ResourceExhausted + grpc_server_max_recv_msg_size: 104857600 # 100 MiB + grpc_server_max_send_msg_size: 104857600 # 100 MiB + +distributor: + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + max_recv_msg_size_mib: 100 + http: + endpoint: 0.0.0.0:4318 + +ingester: + max_block_duration: 5m + +compactor: + compaction: + block_retention: 24h + +storage: + trace: + backend: local + local: + path: /tmp/tempo/traces + +metrics_generator: + registry: + external_labels: + source: tempo + cluster: docker-compose + storage: + path: /tmp/tempo/generator/wal + +overrides: + defaults: + metrics_generator: + processors: [service-graphs, span-metrics]