# Nemesis Performance Tuning
This document details different ways to monitor and tune Nemesis's performance. Nemesis performs differently depending on a variety of factors, including the host's architecture and resources (particularly CPU, RAM, and disk speed) and the workload (e.g., the number of files and imbalances in the mix of documents, .NET assemblies, source code, etc.).

If workflows begin to fail, or you are experiencing major performance issues (as diagnosed in the [Troubleshooting](troubleshooting.md) document), there are a few tunable settings that can help. Alternatively, if performance is already acceptable and you want to squeeze out more throughput or reduce CPU/RAM usage (to save $$$), you can adjust the same values. This document primarily focuses on increasing performance, but you can of course adjust the settings down to decrease resource usage.

# Hardware Resourcing
The first thing to check is whether Nemesis has enough hardware resources.

## CPU
Under load, monitor CPU usage (e.g., with `top`/`htop`, or with the "Node Exporter" Grafana dashboard if monitoring is enabled in Nemesis). If the cores are not all maxed out, continue working through this guide. If every core is pegged, you'll need to increase the CPU resources available to Nemesis, since Nemesis is primarily CPU bound.
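For a quick spot check from the host, something like the following shows per-core and per-container usage (a sketch; `mpstat` requires the sysstat package):

```bash
# Per-core CPU utilization, refreshed every second (requires the sysstat package)
mpstat -P ALL 1

# One-shot CPU/RAM usage per running container
docker stats --no-stream
```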
## RAM
Under load, monitor RAM usage (e.g., with `top`/`htop`, `free -h`, or the "Node Exporter" Grafana dashboard if monitoring is enabled in Nemesis). Ensure that memory is not completely exhausted; if it is, you will need to increase RAM.
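`free -h` is the quickest check; the `available` column is the number that matters, since buffered/cached memory is reclaimable:

```bash
# "available" estimates memory still usable by applications without swapping;
# a large "buff/cache" value on its own is not a problem
free -h
```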
Note that Nemesis will use spare memory for buffering/caching when it can. Minio in particular will use any available RAM to cache file data. This memory is reclaimable, and therefore still usable by other services/applications. We recommend having at least 1 GB of memory available for caching. More may improve performance, but for the most part Nemesis is CPU bound, not RAM bound. You can apply [docker compose memory limits](https://docs.docker.com/reference/compose-file/deploy/#resources) to specific services if you want to constrain how much RAM Minio consumes.
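For example, a compose override along these lines caps Minio's memory (a sketch; the service name `minio` and the 2 GB limit are illustrative and should be matched to your deployment):

```yaml
# compose.override.yaml (sketch) -- constrain how much RAM the Minio service can use
services:
  minio:
    deploy:
      resources:
        limits:
          memory: 2G
```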
## Disk
Disk requirements vary widely depending on the size of your workload. A general rule of thumb is 3x the total size of all the files being uploaded. Use SSDs if possible.
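To see how much space Nemesis's Docker volumes are consuming, and how much is left on the host, you can check:

```bash
# Disk usage broken down by images, containers, and volumes
docker system df

# Overall filesystem usage on the host
df -h
```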
# Analyzing Your Workload
## Analyzing Queues
Normally, people realize Nemesis isn't going fast enough after uploading a bunch of files and watching them take forever to process. This usually means files are being queued for processing faster than they can be consumed. You can confirm this by [analyzing the message queues](./troubleshooting.md#analyze-message-queues) in Nemesis/RabbitMQ.

In RabbitMQ, the `Ready` count signifies messages waiting to be processed, and the `delivery / get` rate (messages per second) gives you an idea of processing speed. The following table maps each Docker service to its queue:

| Docker Service      | Queue Name                      | Description                                                                                 |
|---------------------|---------------------------------|---------------------------------------------------------------------------------------------|
| file_enrichment     | files-new_file                  | Uploaded files that haven't begun processing                                                 |
| document_conversion | files-document_conversion_input | Files waiting to go through document_conversion (strings, text extraction, PDF conversion)   |
| dotnet_service      | dotnet-dotnet_input             | Files waiting for .NET decompilation and assembly inspection                                 |
| noseyparker-scanner | noseyparker-noseyparker_input   | Files waiting to be scanned by noseyparker                                                   |

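If you'd rather pull queue depths from the command line than from the RabbitMQ UI, a one-liner like the following works (a sketch; it assumes the RabbitMQ service in the compose project is named `rabbitmq`):

```bash
# List each queue with its ready (waiting) and unacknowledged (in-flight) message counts
docker compose exec rabbitmq rabbitmqctl list_queues name messages_ready messages_unacknowledged
```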
If the queue message rates are too slow, you can adjust some settings to try to increase performance. The following sections detail the best bang-for-the-buck, service-specific adjustments you can make.

### file_enrichment
Every uploaded file is first placed on the `files-new_file` queue. The file_enrichment service consumes files from the queue and processes each one with the [applicable enrichment modules](https://github.com/SpecterOps/Nemesis/tree/main/libs/file_enrichment_modules). To improve file_enrichment performance, analyze its CPU usage with `docker compose stats file-enrichment` or in the "Docker Monitoring" dashboard in Grafana.

The first thing to tune is making sure file_enrichment is efficiently using a single core (currently, the file_enrichment service does not take full advantage of parallelism). Good utilization looks like ~90-110% CPU usage, i.e., the worker is taking full advantage of one core. If CPU utilization is low, increase the number of workers with the `ENRICHMENT_MAX_PARALLEL_WORKFLOWS` environment variable (the default is 5 workers). Make sure this isn't set so high that workers compete with each other for CPU. If you increase it substantially (e.g., to ~100 workers), you'll also need to raise Dapr's RabbitMQ `prefetchCount` in [files.yaml](https://github.com/SpecterOps/Nemesis/blob/main/infra/dapr/components/pubsub/files.yaml).
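For example, a compose override like this raises the worker count (a sketch; it assumes the variable is passed through the service's `environment` block):

```yaml
# compose.override.yaml (sketch) -- raise the file_enrichment worker count
# If you go much higher, raise the Dapr RabbitMQ prefetchCount to match
services:
  file-enrichment:
    environment:
      - ENRICHMENT_MAX_PARALLEL_WORKFLOWS=10
```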
If additional cores are available, you can scale the file_enrichment container by adding replicas. Do this by modifying both the [compose.yaml](https://github.com/SpecterOps/Nemesis/blob/main/compose.yaml#L327) and [compose.prod.build.yaml](https://github.com/SpecterOps/Nemesis/blob/main/compose.prod.build.yaml#L34) files, uncommenting the disabled `file-enrichment-###` placeholder replicas therein. Feel free to add more replicas by following the same pattern if needed.

### document_conversion
Every file is added to the `files-document_conversion_input` queue. The document_conversion service consumes files from the queue, extracts text, runs `strings` on the file, and converts documents to PDFs. To improve document_conversion performance, analyze its CPU usage with `docker compose stats document-conversion` or in the "Docker Monitoring" dashboard in Grafana. The document_conversion service can take full advantage of parallelism, so adding replicas is not necessary since a single instance can utilize multiple cores. However, [compose.yaml](https://github.com/SpecterOps/Nemesis/blob/main/compose.yaml#L565) has [resource limits](https://docs.docker.com/reference/compose-file/deploy/#resources) that restrict the document-conversion service to 2 cores by default (adjust this if needed). In addition, you can set the `DOCUMENTCONVERSION_MAX_PARALLEL_WORKFLOWS` environment variable to change the number of workers (2 by default).
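For instance, a compose override along these lines raises both the core limit and the worker count (a sketch; the values are illustrative and should match the CPU you actually have available):

```yaml
# compose.override.yaml (sketch) -- let document-conversion use 4 cores and 4 workers
services:
  document-conversion:
    deploy:
      resources:
        limits:
          cpus: "4"
    environment:
      - DOCUMENTCONVERSION_MAX_PARALLEL_WORKFLOWS=4
```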
### noseyparker-scanner
Every text file is added to the `noseyparker-noseyparker_input` queue. The noseyparker-scanner service consumes files from the queue and scans them with noseyparker. To improve noseyparker-scanner performance, analyze its CPU usage with `docker compose stats noseyparker-scanner` or in the "Docker Monitoring" dashboard in Grafana. Like document_conversion, the noseyparker-scanner service can take full advantage of parallelism, so adding replicas is not necessary since a single instance can utilize multiple cores. However, [compose.yaml](https://github.com/SpecterOps/Nemesis/blob/main/compose.yaml#L129) has [resource limits](https://docs.docker.com/reference/compose-file/deploy/#resources) that restrict the noseyparker-scanner service to 2 cores by default (adjust this if needed). In addition, you can set the `NOSEYPARKER_MAX_CONCURRENT_FILES` environment variable to change the number of workers (2 by default).
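The same override pattern applies here (again a sketch with illustrative values):

```yaml
# compose.override.yaml (sketch) -- let noseyparker-scanner use 4 cores and scan 4 files concurrently
services:
  noseyparker-scanner:
    deploy:
      resources:
        limits:
          cpus: "4"
    environment:
      - NOSEYPARKER_MAX_CONCURRENT_FILES=4
```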
# Dapr Scaling
Nemesis uses [Dapr Workflows](https://docs.dapr.io/developing-applications/building-blocks/workflow/workflow-overview/) to build durable and reliable enrichment pipelines. Underneath, the workflows are managed by Dapr's scheduler service, which shares the same Postgres database as Nemesis.

You may need to scale the Dapr infrastructure if you considerably increase the performance of the file_enrichment and/or document_conversion services. Scaling Dapr is beyond the scope of this document, but here are some indicators that you may need to:
- Significant sustained CPU usage (> 80-90%) by the scheduler container and/or the Postgres container.
- Workflows begin failing frequently.
- You notice frequent activity failures/retries in Jaeger traces.

If this is the case, first try increasing the number of scheduler instances ([example](https://github.com/olitomlinson/dapr-workflow-testing/blob/main/compose-1-3.yml#L111-L152)). Dapr does not support more than 3 scheduler instances unless you migrate to [an external etcd store](https://docs.dapr.io/concepts/dapr-services/scheduler/#external-etcd-database). If Postgres becomes the bottleneck, you may consider using a separate Postgres instance to store Dapr state.
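If you do split Dapr's state out, a dedicated state store component would look roughly like this (a sketch; the component name, hostname, and credentials are illustrative, not Nemesis's actual configuration):

```yaml
# Sketch of a Dapr state store component backed by a dedicated Postgres instance
apiVersion: dapr.io/v1alpha1
kind: Component
metadata:
  name: workflowstatestore
spec:
  type: state.postgresql
  version: v1
  metadata:
    - name: connectionString
      value: "host=dapr-postgres user=dapr password=example port=5432 database=dapr"
    # Workflows run on Dapr's actor subsystem, so this store must also be the actor state store
    - name: actorStateStore
      value: "true"
```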
To dig deeper, Jaeger can show how long a particular activity takes. For example, the following queries Jaeger's API and summarizes the durations of recent `run_enrichment_modules` activities from the file-enrichment service:

```
curl -sk --user 'n:n' "https://localhost:7443/jaeger/api/traces?service=file-enrichment&operation=activity%7C%7Crun_enrichment_modules&limit=2000" | jq -r '
  [
    .data[]
    .spans[]
    | select(.operationName == "activity||run_enrichment_modules")
    | .duration
  ] as $durs
  | {
      count: ($durs | length),
      min_ms: ($durs | min / 1000),
      max_ms: ($durs | max / 1000),
      avg_ms: (($durs | add / ($durs | length)) / 1000)
    }
'
```
Additional resources:
- [Tuning Dapr Scheduler for Production](https://www.diagrid.io/blog/tuning-dapr-scheduler-for-production)
- [Dapr Scheduler control plane service overview](https://docs.dapr.io/concepts/dapr-services/scheduler/)