diff --git a/.github/workflows/bandit.yml b/.github/workflows/bandit.yml new file mode 100644 index 0000000..e7b0700 --- /dev/null +++ b/.github/workflows/bandit.yml @@ -0,0 +1,52 @@ +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +# Bandit is a security linter designed to find common security issues in Python code. +# This action will run Bandit on your codebase. +# The results of the scan will be found under the Security tab of your repository. + +# https://github.com/marketplace/actions/bandit-scan is ISC licensed, by abirismyname +# https://pypi.org/project/bandit/ is Apache v2.0 licensed, by PyCQA + +name: Bandit +on: + push: + branches: [ "master" ] + pull_request: + # The branches below must be a subset of the branches above + branches: [ "master" ] + schedule: + - cron: '33 20 * * 3' + +jobs: + bandit: + permissions: + contents: read # for actions/checkout to fetch code + security-events: write # for github/codeql-action/upload-sarif to upload SARIF results + actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status + + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Bandit Scan + uses: shundor/python-bandit-scan@ab1d87dfccc5a0ffab88be3aaac6ffe35c10d6cd + with: # optional arguments + # exit with 0, even with results found + exit_zero: true # optional, default is DEFAULT + # Github token of the repository (automatically created by Github) + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Needed to get PR information. + # File or directory to run bandit on + # path: # optional, default is . + # Report only issues of a given severity level or higher. Can be LOW, MEDIUM or HIGH. Default is UNDEFINED (everything) + # level: # optional, default is UNDEFINED + # Report only issues of a given confidence level or higher. 
Can be LOW, MEDIUM or HIGH. Default is UNDEFINED (everything) + # confidence: # optional, default is UNDEFINED + # comma-separated list of paths (glob patterns supported) to exclude from scan (note that these are in addition to the excluded paths provided in the config file) (default: .svn,CVS,.bzr,.hg,.git,__pycache__,.tox,.eggs,*.egg) + # excluded_paths: # optional, default is DEFAULT + # comma-separated list of test IDs to skip + # skips: # optional, default is DEFAULT + # path to a .bandit file that supplies command line arguments + # ini_path: # optional, default is DEFAULT + diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..c80aea1 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,38 @@ +name: CI + +on: + push: + branches: [master] + pull_request: + branches: [master] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install ruff + run: pip install "ruff>=0.4" + + - name: Ruff check + run: ruff check learn_upload/ tests/ scripts/ + + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: pip install -r requirements-dev.txt + + - name: Run tests + run: python -m pytest tests/ -v diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..e283ea6 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,36 @@ +name: Release + +on: + push: + tags: ["v*"] + +jobs: + build: + runs-on: windows-latest + permissions: + contents: write + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: pip install pydicom PyQt6 numpy pyinstaller + + - name: Build exe + run: pyinstaller learn_upload.spec + + - name: Upload artifact + uses: 
actions/upload-artifact@v4 + with: + name: learn_upload + path: dist/learn_upload.exe + + - name: Create GitHub Release + uses: softprops/action-gh-release@v2 + with: + files: dist/learn_upload.exe + generate_release_notes: true diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cc717f6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,95 @@ +# Patient Data Protection - CRITICAL: Never commit patient data +patient-confidential/ +**/patient-confidential/ +*patient*confidential* +*confidential* + +# Patient data files that might appear elsewhere +*patient*.csv +*Patient*.csv +*PATIENT*.csv +patient_*.csv +Patient_*.csv + +# Python artifacts +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# Virtual environments +venv/ +env/ +ENV/ +.venv/ +.env/ + +# IDE and editor files +.vscode/ +.idea/ +.vs/ +*.swp +*.swo +*~ +.DS_Store +Thumbs.db + +# Temporary files +*.tmp +*.temp +*.log +*.bak +*.backup +*.docx + +# Medical imaging data files (potential patient data) +*.dcm +*.DCM +*.his +*.HIS + +# Large data directories +data/ +Data/ +DATA/ +XVI_COLLECTION/ +*XVI*COLLECTION* +Docs/Prostate/ + +# Output files that might contain patient data +output/ +Output/ +results/ +Results/ +exports/ +Exports/ + +# Jupyter notebook checkpoints +.ipynb_checkpoints/ + +# System files +.DS_Store +Thumbs.db +desktop.ini + +# Claude Code configuration +CLAUDE.md +.claude/ \ No newline at end of file diff --git a/Docs/Elekta_XVI_Reconstruction_Directory_Analysis.md b/Docs/Elekta_XVI_Reconstruction_Directory_Analysis.md new file mode 100644 index 0000000..61408c5 --- /dev/null +++ b/Docs/Elekta_XVI_Reconstruction_Directory_Analysis.md @@ -0,0 +1,176 @@ +# Elekta XVI Reconstruction Directory Analysis + +## Overview + +The Reconstruction directory comes from the Elekta XVI CBCT (Cone Beam Computed Tomography) 
imaging system export. This directory contains the reconstructed volumetric data and associated configuration files generated from the projection images (`.his` files) acquired during CBCT scanning on the VersaHD linear accelerator. + +Export procedure to get this is in <**to be added**>. + +The XVI system performs cone beam CT imaging for patient positioning verification and adaptive radiotherapy workflows. The Reconstruction directory stores the final 3D volumes along with all the technical parameters used during the reconstruction process. + +## Directory Structure + +Each patient's CBCT acquisition follows this structure: +``` +patient_XXXXXXXX/ +├── IMAGES/ +│ └── img_[UID]/ +│ ├── _Frames.xml # Projection acquisition parameters +│ ├── 00001.[UID].his # Projection image files +│ ├── 00002.[UID].his # ... +│ ├── ... # Additional projection files +│ └── Reconstruction/ # ← FOCUS OF THIS ANALYSIS +│ ├── [UID].INI # XVI software configuration +│ ├── [UID].INI.XVI # Reconstruction parameters +│ ├── [UID].[timestamp].INI # Session-specific configuration +│ ├── [UID].[timestamp].INI.XVI # Session-specific registration details +│ ├── [UID].[timestamp].SCAN # Reconstructed volume data +│ ├── [UID].[timestamp].SCAN.MACHINEORIENTATION # Coordinate transformation +│ └── [UID].RPS.dcm # DICOM registration data (optional) +``` + +## File Types and Contents + +### 1. 
Configuration Files (.INI) + +**Files**: `[UID].INI`, `[UID].[timestamp].INI` + +These are XVI software configuration files containing: + +#### XVI Software Settings +- User interface state and visibility settings +- Display parameters and zoom factors +- Administrative and logging configuration +- Multi-threading and refresh interval settings + +#### Patient Identification +```ini +[IDENTIFICATION] +PatientID=15002197 +TreatmentID=WholeBrain-C2Retrt +TreatmentUID=1.3.46.423632.33783920233217242713.224 +ReferenceUID=1.2.840.113854.59112832676204369253232190232540417741 +FirstName=Anonymized +LastName=Anonymized +``` + +#### Directory Paths +- `AdministrativeFilesDirectory`: Path to reconstruction admin files +- `ReferenceCacheDirectory`: CT reference image location +- `ProjectionDirectory`: Source projection files location +- `ReconstructedScansDirectory`: Output reconstruction location + +#### Clinical Context +- Treatment plan description and patient information +- Links to treatment planning system via UIDs +- Status line text showing active treatment plan + +### 2. 
Reconstruction Parameters (.INI.XVI) + +**Files**: `[UID].INI.XVI`, `[UID].[timestamp].INI.XVI` + +These files contain technical reconstruction parameters: + +#### Volume Specifications +- `ReconstructionDimensionX/Y/Z`: Voxel dimensions (typically 264×270×270) +- `ReconstructionVoxelSize`: Spatial resolution (typically 0.1 cm) +- `ReconstructionOffsetX/Y/Z`: Volume positioning offsets + +#### Image Processing Parameters +- `ProjectionImageDimension`: Detector size (typically 256×256) +- `ReconstructionFilter`: Applied filter type (e.g., "Wiener") +- `ReconstructionFilterParameters`: Filter-specific settings +- `ScatterCorrectionAlg`: Scatter correction algorithm +- `BowTieScatterCorrection`: Hardware scatter reduction + +#### Technical Settings +- `ReconstructionDataType`: Internal data format (float) +- `OutputReconstructionDataType`: Export format (short) +- `ScaleOut` and `OffsetOut`: Hounsfield unit scaling +- `Interpolate`: Interpolation method +- `CollimatorName`: Beam collimation setup + +### 3. Reconstructed Volume Data (.SCAN) + +**Files**: `[UID].[timestamp].SCAN` + +These are AVS (Application Visualization System) format files containing: + +#### Header Information +``` +# AVS wants to have the first line starting with its name + +kv=100 # X-ray tube voltage +ma=10 # Tube current +ms=10 # Exposure time +ndim=3 # 3D volume data +dim1=264 # X dimension +dim2=270 # Y dimension +dim3=270 # Z dimension +``` + +#### Data Format Specification +- `nspace=3`: 3D spatial data +- `veclen=1`: Scalar values (not vector) +- `data=xdr_short`: 16-bit integer data in XDR format +- `field=uniform`: Regular grid spacing +- `nki_compression=2`: Compression type + +#### Binary Volume Data +The remainder of the file contains the compressed 3D CBCT volume data representing the reconstructed CT numbers/Hounsfield units for each voxel. + +### 4. 
Machine Orientation (.SCAN.MACHINEORIENTATION) + +**Files**: `[UID].[timestamp].SCAN.MACHINEORIENTATION` + +Contains coordinate system transformation data: + +#### Transformation Matrix +- 4×4 transformation matrix in AVS format +- Maps between image coordinates and machine/IEC coordinates +- Essential for accurate patient positioning and registration +- Binary floating-point data in XDR format + +### 5. DICOM Registration Data (.RPS.dcm) + +**Files**: `[UID].RPS.dcm` (when present) + +DICOM RT Registration and Positioning Structure files containing: +- Spatial registration information +- Coordinate system relationships +- Links to planning CT and treatment planning system +- DICOM-compliant metadata for integration with treatment planning + +## Technical Context + +### Coordinate Systems +The XVI system uses IEC 61217 coordinate conventions: +- Patient coordinate system (Fixed) +- Gantry coordinate system (Rotating) +- Table coordinate system (Translating/Rotating) + +The MACHINEORIENTATION files provide the transformations between these coordinate systems at the time of imaging. + +### Reconstruction Workflow +1. **Projection Acquisition**: kV X-ray projections captured as .his files +2. **Calibration**: Flat-field and dark-field corrections applied +3. **Preprocessing**: Scatter correction, beam hardening correction +4. **Reconstruction**: Filtered back-projection or iterative algorithms +5. 
**Output**: 3D volume saved as .SCAN files with metadata + +### Data Integration +- **Treatment Planning**: ReferenceUID links to planning CT +- **Patient Positioning**: Transformation matrices enable registration +- **Quality Assurance**: Configuration files provide audit trail +- **Archive Storage**: Complete parameter set enables reprocessing + + +## File Size and Storage Considerations + +- **.SCAN files**: Largest files, typically 20-50 MB depending on matrix size +- **.INI files**: Small text files, typically <10 KB +- **.MACHINEORIENTATION files**: Small binary files, typically <1 KB +- **.RPS.dcm files**: Variable size DICOM files, typically 1-10 KB + +Total storage per CBCT scan: Approximately 25-60 MB including all reconstruction files. + diff --git a/Docs/GC_Elekta_Patient_Upload_Process.md b/Docs/GC_Elekta_Patient_Upload_Process.md new file mode 100644 index 0000000..33875d2 --- /dev/null +++ b/Docs/GC_Elekta_Patient_Upload_Process.md @@ -0,0 +1,129 @@ +# GC Elekta Patient Upload Process to USYD RDS/research/PRJ-LEARN + +## Purpose of document + +This document serves as the SOP for patient data transfer to LEARN from the PRIME datastore and from the Elekta_fdt datastore. This is to be used in conjunction with the USYD transfer WIs, which address connectivity and folder structure in more detail. + +Before registering the patient we need to check if the dataset has the following: + +1. Height/weight information in MQ assessments (Vital Signs) during the treatment period. This requirement might be revisited at a subsequent revision. +2. Projections with target within the 180 degree frames. +3. RPS.dcm with registration and shift values available in the XVIclinical folder and MQ. + +## PRIME Data Transfer + +The Site Physicist of each PRIME site will move PRIME-related data to [PRIME Data Store](https://genesiscare.sharepoint.com/:f:/r/sites/PRIME/Shared%20Documents/PRIME%20Data%20Store?csf=1&web=1&e=mhtCbJ). 
+ +The PRIME trial physicist will be responsible for the rest of the process below. Only the sessions with SFOV CBCTs need to be transferred to the LEARN database. Ignore sessions with MFOV CBCTs. + +<**to be expanded**/copy from PRIME workflow doc> + +## Elekta_fdt Data Transfer + +Open up the [Patient Data Log.xlsx](https://genesiscare.sharepoint.com/:x:/r/sites/LEARN/Shared%20Documents/General%20Channel/Patient%20Data%20Log.xlsx?d=w2ebf6abe06d542cb9e87b3c40fd1beea&csf=1&web=1&e=XmStjS) in LEARN Teams. This is a list of patients in the Elekta_FDT folder. Go to the patient folder in elekta_fdt that is currently being processed and select an unprocessed patient on the worksheet. + +The elekta_fdt folder name contains the MRN. Open the patient in Mosaiq. + +The Scan UID of the CBCT has the date and time embedded in it. One place to get it is in one of the .ini files in IMAGES\img_1.3.46.423632.337839202332931827841.8\Reconstruction + +Date and time of CBCT: +ScanUID=1.3.46.423632.33783920233217242713.224.2023-03-21165402768 + +Open the patient in Mosaiq and find out what the treatment was, and check whether registration shifts have been uploaded as required into MQ. If there are shifts then go to the patient plan in Monaco. + +In the folder structure of the XVI export, this scan UID is in one of the .ini files in IMAGES\img_1.3.46.423632.337839202332931827841.8\Reconstruction + +![XVI Reconstruction folder showing INI file contents with ScanUID and date/time](images/GC_Elekta_Patient_Upload_Process/01_xvi_reconstruction_ini.png) + +Open the patient in Monaco. If not in the default location, retrieve from archive (ensuring the usual precautions for retrieval are followed so it doesn't overwrite any data). Open the correct plan (using the date and reference meta-data of the first fraction CBCT) and open the 3D view with the PTV visible. 
+ +## Target in projections check + +Open **Contour Alignment Tool** from imageX and follow instructions in [https://github.com/Image-X-Institute/contour-alignment-tool.git](https://github.com/Image-X-Institute/contour-alignment-tool.git) to get to view the contour overlayed on projections. Use 3D view in Monaco to verify and adjust contour location against the projections. + +![Contour Alignment Tool showing PTV overlay on projection](images/GC_Elekta_Patient_Upload_Process/02_contour_alignment_tool.png) ![X-ray projection with PTV contour overlay on spine](images/GC_Elekta_Patient_Upload_Process/03_spine_ptv_projection.png) + +All the data loads fine and goes green, gets to Data processing in progress, Loading the CT happens quickly. "Generate structure volume mask" takes a long time. Looks like it's stalled but it has not. + +To assess whether the PTV is within the target, + +1. Check image is SFOV (come back MFOV images if more patients are needed) +2. Open up patient in Monaco in 3D view and at the same time have the contour alignment tool open. Use both to guide where the PTV will be on the patient anatomy. + +## Patient Details and shifts from Mosaiq Images List + +Open the excel template for Treatment Notes + +kV and mA per frame can be obtained from the .ini in the reconstruction folder. mAs ?? + +TubeMA=10.0000 TubeKV=100.0000 + +Coordinate system will be "Patient Coordinate System (Beam)". See Appendix to check if beam or anatomy. + +The image list SRO browser can be used to fill the shift in patient coordinates. + +![Mosaiq Images toolbar icon](images/GC_Elekta_Patient_Upload_Process/04_mosaiq_images_icon.png) + +The registration results applied and shifted to and from that fraction's CBCT is listed here in patient coordinates. You can highlight and copy these values to the patient details spreadsheet. 
+ +![Mosaiq SRO shift values in patient coordinates](images/GC_Elekta_Patient_Upload_Process/05_mosaiq_sro_shifts.png) + +![Mosaiq SRO detailed registration view](images/GC_Elekta_Patient_Upload_Process/06_mosaiq_sro_detailed.png) + +## Anonymisation and Folder Sorting + +Anonymisation and folder sorting are now handled by the **LEARN Pipeline GUI** (`python -m learn_upload`). This replaces the previous MIM-based anonymisation workflow. + +The GUI wizard automates the following steps: + +1. **Folder Sort** -- copies XVI export files into the LEARN directory structure +2. **DICOM Anonymisation** -- strips patient-identifiable information from DICOM files and assigns PATxx sequential IDs +3. **PII Verification** -- scans output files for any residual patient-identifiable data +4. **CBCT Shift Report** -- extracts couch shift values from RPS registration files + +See the [GUI Walkthrough](GUI_Walkthrough.md) for the full step-by-step guide. + +## Transfer + +Instructions [LEARN data collection15May2025.pdf](https://genesiscare.sharepoint.com/:b:/r/sites/LEARN/Shared%20Documents/General%20Channel/LEARN%20data%20collection15May2025.pdf?csf=1&web=1&e=YIrMxY) + +Use the SFTP (cyberduck or WinSCP) to connect and place files in LEARN folder as per folder structure specified in the Sample_testcentre. + +![LEARN Sample_testcentre folder structure](images/GC_Elekta_Patient_Upload_Process/11_learn_folder_structure.png) + +Check the actual folder in LEARN as the recommended structure might have been updated since the time of screen shot below. + +GC is only providing CBCTs at this stage so the reconstructed CBCTs, registration objects and the projection files will be in the PATxx >> FXxx > CBCT folder. 
**Only include the CBCTs on which shifts were applied and treated.** + +## Appendix + +### Definitions from Mosaiq 2.83 help file + +**Images list column definitions.** + +| Parameter | Description | +|-----------|-------------| +| Sup | Displays the magnitude of the superior or inferior offset of the image in centimeters. | +| Lat | Displays the magnitude of the left or right offset of the image in centimeters. | +| Ant | Displays the magnitude of the anterior or posterior offset of the image in centimeters. | +| Mag. | Displays the offset vector magnitude. | +| Cor (B) | Displays the coronal angular correction in degrees. | +| Sag (B) | Displays the sagittal angular correction in degrees. | +| Trans (B) | Displays the transverse angular correction in degrees. | + +**Setup Offset Reference** + +Select the offset reference to be used by Image Management. Setup corrections on images are described in patient coordinates (Ant/Post, Sup/Inf, R/L). You can select Beam or Anatomy to determine the direction in which to move the beam. These selections configure the MOSAIQ system to show setup corrections direction relative to beam parameters or patient anatomy shown in the image. + +The settings are opposite of each other. If you move the patient in the inferior direction, the isocenter moves in the superior direction. + +If you select Beam, offsets show the direction the displayed isocenter/aperture must move across the displayed anatomy to align with the planned target position. If you select Anatomy, offsets show the direction the patient must move below the immobile beam. The default is Beam. + +These options configure the system to show setup corrections direction relative to beam parameters or patient anatomy viewed in the image. + +- **Beam:** The direction the current beam references (jaw/isocenter) must move to align with the target anatomy. +- **Anatomy:** The direction target anatomy shown in the image must move to align with the displayed beam references. 
+ +Department configuration for Setup Offset Reference at GC is **Beam**. + +![Mosaiq Setup Offset Reference configuration showing Beam selected](images/GC_Elekta_Patient_Upload_Process/12_mosaiq_offset_reference.png) diff --git a/Docs/GUI_Walkthrough.md b/Docs/GUI_Walkthrough.md new file mode 100644 index 0000000..bc8a4a1 --- /dev/null +++ b/Docs/GUI_Walkthrough.md @@ -0,0 +1,213 @@ +# LEARN Pipeline GUI Walkthrough + +## Purpose + +The LEARN Pipeline GUI is a 6-step desktop wizard that automates the transfer of Elekta XVI CBCT patient data from GenesisCare exports to the USYD RDS/research/PRJ-LEARN directory structure. It replaces the manual workflow documented in the [GC Elekta Patient Upload Process](GC_Elekta_Patient_Upload_Process.md) with a guided, reproducible process covering folder sorting, DICOM anonymisation, PII verification, and CBCT shift reporting. + +## Prerequisites + +- **Python 3.10+** with the following packages installed: + ``` + pip install pydicom PyQt6 + ``` +- Access to the XVI patient export directory (e.g. `\\GC04PRBAK02\elekta_fdt\XVI_COLLECTION\processed\`) +- A writable output directory for the LEARN folder structure + +## Expected Input: XVI Export Directory + +The **Source Path** in Step 1 should point to an Elekta XVI patient export with this structure: + +``` +patient_XXXXXXXX/ + XVI Export/ <- configurable via "Images Subdirectory" field + img_/ <- one directory per acquisition session + _Frames.xml <- treatment ID, acquisition preset, kV/mA + *.his <- raw X-ray projection images + Reconstruction/ + *.INI / *.INI.XVI <- ScanUID (embeds datetime), tube kV/mA + *.SCAN <- reconstructed CBCT volume + *.SCAN.MACHINEORIENTATION <- machine orientation metadata + *.RPS.dcm <- registration DICOM (couch shifts) + img_/ ... 
<- additional sessions +``` + +### Optional Input: TPS Export + +If a **TPS Export Path** is provided, it should contain subdirectories exported from Monaco or the treatment planning system: + +``` +TPS Export/ + DICOM CT Images/ <- reference CT slices + DICOM RT Plan/ <- treatment plan DICOM + DICOM RT Structures/ <- structure set DICOM + DICOM RT Dose/ <- dose distribution DICOM +``` + +### Optional Input: Trajectory Logs + +If a **Trajectory Logs Dir** is provided, it should contain fraction-numbered subdirectories: + +``` +Trajectory_Logs/ + FX01/ + MarkerLocations*.txt <- KIM marker tracking data + FX02/ ... +``` + +## Launching the GUI + +```bash +python -m learn_upload +``` + +The window opens with a dark-themed interface. A sidebar on the left shows all six steps; the main panel on the right displays the active step. Completed steps get a green checkmark in the sidebar and can be clicked to review. + +--- + +## Step 1: Configuration + +The Configuration page collects all the parameters needed for the pipeline run. It is divided into four cards. + +### Patient Identity + +| Field | Description | +|-------|-------------| +| **Anonymised ID** | The PATxx identifier for this patient (e.g. `PAT01`). Must match the format `PATxx` where `xx` is a two-digit number. | +| **Site Name** | The treatment site label used as a subfolder in the output (e.g. `prostate`). | + +### Data Paths + +| Field | Description | +|-------|-------------| +| **Source Path** | Root of the XVI patient export (e.g. `Patient_258215`). Contains the `IMAGES/` or `XVI Export/` subdirectory. | +| **TPS Export Path** | Path to exported treatment planning system DICOM files (optional). | +| **Output Path** | Root output directory where the LEARN folder structure will be created. | +| **Staging Path** | Intermediate staging directory. Defaults to `output/_staging` if left blank. | +| **Images Subdirectory** | Name of the images folder within the patient export. Defaults to `XVI Export`. 
| +| **Centroid File** | Path to a centroid CSV file for KIM data (optional). | +| **Trajectory Logs Dir** | Directory containing trajectory log files (optional). | + +### PII Search Strings + +A comma-separated list of patient identifiers (MRN, name fragments, etc.) that will be searched for during the PII verification step. The patient's MRN is also auto-detected from the source directory name. + +### Options + +- **Dry run** -- When checked, the folder sort step previews the file operations without copying any files. + +![Configuration page showing Patient Identity, Data Paths, and PII Search Strings cards](images/gui-walkthrough/01.png) + +![Options card with dry run checkbox and "Continue to Preview" button](images/gui-walkthrough/02.png) + +Once all required fields are filled in, click **Continue to Preview** to proceed. + +--- + +## Step 2: Data Preview + +After submitting the configuration, the GUI discovers all imaging sessions within the source directory. A table displays the results with the following columns: + +| Column | Description | +|--------|-------------| +| **Type** | Session type (e.g. `cbct`, `planar`). | +| **Directory** | The `img_` directory name containing this session. | +| **Datetime** | Scan date and time extracted from the ScanUID. | +| **Treatment** | Treatment plan ID parsed from `_Frames.xml`. | +| **kV** | Tube voltage from the reconstruction INI file. | +| **mA** | Tube current from the reconstruction INI file. | +| **RPS** | Whether an RPS registration file was found for this session. | + +A stat card above the table shows the total number of discovered sessions. + +Review the table to confirm the expected sessions are present, then click **Start Folder Sort** to proceed. + +> *Screenshot to be added.* + +--- + +## Step 3: Folder Sort + +The folder sort step copies files from the XVI export into the LEARN directory structure, organising them by fraction and file type. 
+ +During execution, a progress bar shows the current operation (e.g. `FX4/KIM-KV`) and percentage. The log output panel displays real-time messages including treatment ID discovery from `_Frames.xml` files and session-to-fraction matching. + +![Folder Sort in progress at 80%, log showing session matching and fraction assignment](images/gui-walkthrough/03.png) + +When complete, stat cards display a summary of the sorted data: + +| Stat | Description | +|------|-------------| +| **Sessions** | Total imaging sessions processed. | +| **Fractions** | Number of treatment fractions identified. | +| **.his Files** | Projection image files copied to KIM-KV folders. | +| **SCAN Files** | Reconstructed CBCT scan files copied. | +| **RPS Files** | Registration Position Storage DICOM files copied. | +| **INI Files** | Reconstruction parameter files copied. | + +![Folder Sort complete showing 30 sessions, 5 fractions, 2673 .his files, 22 SCAN files, 8 RPS files, 44 INI files](images/gui-walkthrough/04.png) + +Click **Start Anonymisation** to proceed. + +--- + +## Step 4: Anonymise + +The anonymisation step processes all DICOM, XML, and INI files in the output directory, replacing patient-identifiable information with the anonymised ID. + +A progress bar tracks the current file being processed (e.g. `CT_1859_301.dcm`) and the overall percentage. The log output shows each file as it is anonymised, including the source and destination paths. + +![Anonymise step in progress at 93%, processing DICOM files with log output](images/gui-walkthrough/05.png) + +When complete, stat cards show: + +| Stat | Description | +|------|-------------| +| **DICOM** | Number of DICOM files anonymised. | +| **XML** | Number of `_Frames.xml` files anonymised. | +| **INI** | Number of INI configuration files anonymised. | +| **TPS Imported** | Number of TPS export files imported and anonymised. | +| **Errors** | Number of files that failed to anonymise (highlighted in red if > 0). 
| + +![Bottom of Anonymise page showing "Run PII Verification" button](images/gui-walkthrough/06.png) + +Click **Run PII Verification** to proceed. + +--- + +## Step 5: PII Verification + +The PII verification step scans all files in the output directory for any residual patient-identifiable information matching the search strings configured in Step 1. The MRN extracted from the source directory name is automatically included. + +![PII Verification scanning in progress](images/gui-walkthrough/07.png) + +The result is displayed as either **PASS** (green banner) or **FAIL** (red banner), along with stat cards showing the number of files scanned and the number of findings. + +![PII Verification FAIL result: 10,291 files scanned, 3 findings](images/gui-walkthrough/08.png) + +If findings are detected, a table lists each one with: + +| Column | Description | +|--------|-------------| +| **File** | Relative path to the file containing PII. | +| **Location** | Where in the file the match was found (e.g. DICOM tag name, XML element). | +| **Matched** | The PII string that was matched. | + +If the result is FAIL, you should investigate and resolve the findings before uploading to LEARN. Common causes include PII embedded in private DICOM tags or XML metadata that was not covered by the anonymisation rules. + +![Bottom of PII Verification page showing "Generate CBCT Report" button](images/gui-walkthrough/09.png) + +Click **Generate CBCT Report** to proceed. + +--- + +## Step 6: CBCT Shift Report + +The final step generates a markdown report of CBCT couch shifts extracted from the RPS registration files. The report is displayed in a text viewer and automatically saved to the `Patient Files/PATxx/` folder as `cbct_shift_report.md`. + +The report includes per-fraction registration shifts in patient coordinates, which can be cross-referenced with Mosaiq image list values for quality assurance. 
+ +If no RPS files are found in the output directory, an error message is shown instead. + +Once the report is generated, click **New Patient** to reset the wizard and start processing the next patient. + +> *Screenshot to be added.* diff --git a/Docs/LEARN_Upload_Automation_Plan.md b/Docs/LEARN_Upload_Automation_Plan.md new file mode 100644 index 0000000..23876aa --- /dev/null +++ b/Docs/LEARN_Upload_Automation_Plan.md @@ -0,0 +1,337 @@ +# LEARN Upload Automation Plan + +## Overview + +This document describes the planned Python automation for the GC Elekta Patient Upload Process to USYD LEARN. The goal is to replace manual steps (MIM anonymisation, manual folder creation, manual data entry) with Python scripts while retaining user prompts for steps that require clinical judgement. + +## Current Manual Bottlenecks + +| Step | Current Method | Automation Target | +|------|---------------|-------------------| +| DICOM anonymisation | Export to MIM, right-click anonymise | Python script using `pydicom` | +| Folder structure creation | Manual folder creation + file copying | Python script maps XVI -> LEARN structure | +| Treatment Notes entry | Manual data entry from .ini files and Mosaiq | Auto-populate xlsx from parsed XVI data | +| Shift extraction | Copy from Mosaiq SRO browser | Auto-extract from RPS.dcm via `ElektaRPSExtractor` | +| Process coordination | Follow SOP document manually | Interactive CLI wrapper guides user through stages | + +## Package Structure + +``` +learn_upload/ + __init__.py + config.py # Shared configuration, paths, constants + utils.py # INI parsing, XML parsing utilities + anonymise_dicom.py # DICOM anonymisation (replaces MIM) + folder_sort.py # XVI export -> LEARN structure mapping + treatment_notes.py # Treatment_Notes.xlsx generation + upload_workflow.py # Interactive wrapper CLI (main entry point) +``` + +## Dependencies + +| Package | Status | Purpose | +|---------|--------|---------| +| `pydicom` 3.0.1+ | Already installed | DICOM 
file reading and anonymisation | +| `numpy` | Already installed | Matrix operations for shift extraction | +| `openpyxl` 3.1+ | **New - to install** | Treatment_Notes.xlsx generation | + +## Script 1: `anonymise_dicom.py` + +### Purpose +Replace the manual MIM-based anonymisation with Python. Only TPS data (CT_SET/, DICOM_PLAN/) needs anonymisation -- projection and CBCT files in elekta_fdt are already anonymised. + +### Anonymised ID Format +**PATxx** (e.g., PAT01, PAT02) -- matching the LEARN folder structure convention. + +### DICOM Tags to Modify + +| Tag | Name | Action | +|-----|------|--------| +| (0010,0010) | PatientName | Replace with PATxx | +| (0010,0020) | PatientID | Replace with PATxx | +| (0010,0030) | PatientBirthDate | Clear | +| (0008,0050) | AccessionNumber | Clear | +| (0008,0080) | InstitutionName | Clear | +| (0008,0081) | InstitutionAddress | Clear | +| (0008,0090) | ReferringPhysicianName | Clear | +| (0008,1048) | PhysiciansOfRecord | Clear | +| (0008,1070) | OperatorsName | Clear | +| (0020,0010) | StudyID | Replace with PATxx | + +### Tags to Preserve +- PatientSex, PatientAge, PatientSize, PatientWeight (needed for research) +- StudyDescription (contains treatment info) +- **All DICOM UIDs** (preserves referential integrity between CT, structure set, and plan) + +### Class Design + +```python +class DicomAnonymiser: + def __init__(self, patient_dir: Path, anon_id: str, output_dir: Path): + """ + patient_dir: Path to patient_XXXXXXXX directory + anon_id: Sequential ID (e.g., "PAT01") + output_dir: Staging directory for anonymised output + """ + + def anonymise_file(self, dcm_path: Path) -> Path + def anonymise_ct_set(self) -> list[Path] + def anonymise_plan(self) -> list[Path] + def anonymise_all(self) -> dict # Returns summary +``` + +### Reuse +- `pydicom` patterns from existing `scripts/extract_elekta_rps_matrices.py` +- Never modifies source files -- writes to staging directory + +## Script 2: `folder_sort.py` + +### Purpose +Map the 
flat XVI export structure into the hierarchical LEARN target structure automatically. + +### Source -> Target Mapping + +| Source (XVI) | Target (LEARN) | +|-------------|----------------| +| `patient_XXXXXXXX/IMAGES/img_[UID]/*.his` | `Patient Images/PATxx/FXn/KIM-KV/` | +| `patient_XXXXXXXX/IMAGES/img_[UID]/Reconstruction/*.SCAN` | `Patient Images/PATxx/FXn/CBCT/CBCTm/Reconstructed CBCT/` | +| `patient_XXXXXXXX/IMAGES/img_[UID]/Reconstruction/*.SCAN.MACHINEORIENTATION` | `Patient Images/PATxx/FXn/CBCT/CBCTm/Reconstructed CBCT/` | +| `patient_XXXXXXXX/IMAGES/img_[UID]/Reconstruction/*.RPS.dcm` | `Patient Images/PATxx/FXn/CBCT/CBCTm/Registration file/` | +| `patient_XXXXXXXX/CT_SET/*.DCM` (anonymised) | `Patient Plans/PATxx/CT/` | +| `patient_XXXXXXXX/DICOM_PLAN/*.DCM` (anonymised) | `Patient Plans/PATxx/Plan/` | + +### Target LEARN Folder Structure + +``` +[SiteName]/ + Patient Files/ + PATxx/ + Treatment_Notes.xlsx + Patient Images/ + PATxx/ + FX0/ + CBCT/ + CBCT1/ + CBCT Projections/ + CDOG/ + IPS/ + Reconstructed CBCT/ + Registration file/ + CBCT2/ (if multiple CBCTs same fraction) + KIM-KV/ (.his projection files go here) + IFI/ + FX1/ ... FXn/ + Patient Plans/ + PATxx/ + CT/ (anonymised CT_SET DICOM) + Plan/ (anonymised DICOM_PLAN) + Ground Truth/PATxx/ + Patient Measured Motions/ + RPM/ + Trajectory Logs/ +``` + +### Fraction Assignment Logic + +1. Parse each `img_*/Reconstruction/*.INI` for ScanUID datetime (e.g., `...2023-03-21165402768`) +2. Sort all sessions chronologically +3. Assign FX0, FX1, FX2... by date order +4. Same-day CBCTs become CBCT1, CBCT2 within same FXn +5. 
**Filter:** Only include CBCTs with RPS.dcm and non-zero applied shifts (per SOP requirement) + +### Session Data Model + +```python +@dataclass +class CBCTSession: + img_dir: Path + scan_uid: str + scan_datetime: datetime + treatment_id: str # From _Frames.xml + tube_kv: float # From INI TubeKV + tube_ma: float # From INI TubeMA + has_rps: bool + rps_path: Optional[Path] + couch_shifts: Optional[dict] # From ElektaRPSExtractor + ini_path: Path +``` + +### Class Design + +```python +class LearnFolderMapper: + def __init__(self, patient_dir: Path, anon_id: str, site_name: str, + output_base: Path) + + def discover_cbct_sessions(self) -> list[CBCTSession] + def assign_fractions(self, sessions: list) -> dict[str, list] + def filter_treated_sessions(self, sessions: list) -> list + def create_directory_structure(self) -> Path + def copy_projections(self, session, fx_path: Path, cbct_num: int) + def copy_reconstructions(self, session, fx_path: Path, cbct_num: int) + def copy_registration(self, session, fx_path: Path, cbct_num: int) + def copy_anonymised_plans(self, anon_ct_dir: Path, anon_plan_dir: Path) + def execute(self, dry_run: bool = False) -> dict +``` + +### Reuse +- `scripts/elektafdt_crawler.py` -- patient directory traversal and `_Frames.xml` parsing patterns +- `scripts/extract_elekta_rps_matrices.py` -- `ElektaRPSExtractor` for shift extraction from RPS.dcm + +## Script 3: `treatment_notes.py` + +### Purpose +Auto-generate Treatment_Notes.xlsx pre-filled with data extractable from XVI exports. 
+ +### Auto-Filled Fields + +| Field | Source | +|-------|--------| +| RedCap ID | PATxx (user input) | +| Image Collected | "CBCTs" (constant) | +| Linac Type | "Elekta VersaHD" (constant) | +| Imager Position (SDD) | "150cm" (constant) | +| Couch Type | "Precise Table" (constant) | +| Coordinate System | "Patient Coordinate System (Beam)" (constant per SOP) | +| kV | From INI `TubeKV` | +| mA | From INI `TubeMA` | +| Per-fraction Date | Parsed from ScanUID datetime | +| Per-fraction Shifts (Sup/Lat/Ant) | From RPS.dcm via ElektaRPSExtractor | +| Per-fraction Rotations (Cor/Sag/Trans) | From RPS.dcm rotation values | + +### Manual Fields (prompted by wrapper) + +| Field | Notes | +|-------|-------| +| Height/Weight | From Mosaiq assessments only | +| Marker Length and Type | If applicable | +| CDOG version | If applicable | +| mAs confirmation | Verify if TubeMA is mA or mAs | + +### Package +`openpyxl` for .xlsx generation with formatting matching the existing Treatment_Notes.xlsx templates in `Docs/Prostate/Patient Files/PATxx/`. + +## Script 4: `upload_workflow.py` (Interactive Wrapper) + +### Purpose +Guide the user through the complete upload process with an interactive CLI. Automates what can be automated, prompts when manual clinical judgement is needed. 
+ +### 8-Stage Workflow + +``` +STAGE 1: PATIENT SELECTION + [AUTO] Scan elekta_fdt base directory for patient_* folders + [AUTO] Display list with plan names (reuses elektafdt_crawler logic) + [PROMPT] "Enter patient folder name (e.g., patient_22002761): " + [PROMPT] "Enter sequential anonymised ID (e.g., PAT01): " + [PROMPT] "Enter anatomical site name (e.g., Prostate): " + +STAGE 2: DATA DISCOVERY + [AUTO] Enumerate all img_* directories under IMAGES/ + [AUTO] Parse INI files for scan dates, kV, mA + [AUTO] Parse _Frames.xml for treatment plan name + [AUTO] Check for RPS.dcm presence + [AUTO] Extract registration shifts from RPS.dcm + [DISPLAY] Summary table of all CBCT sessions with dates, shifts, plan names + +STAGE 3: MANUAL VERIFICATION CHECKPOINTS + [PROMPT] "Have you verified height/weight in Mosaiq assessments? (y/n)" + [PROMPT] "Have you opened the plan in Monaco and verified PTV in 3D view? (y/n)" + [PROMPT] "Have you run Contour Alignment Tool and confirmed target in >=180 deg? (y/n)" + [PROMPT] "Is this SFOV CBCT? (y/n)" + +STAGE 4: FRACTION ASSIGNMENT + [AUTO] Sort sessions chronologically, assign FX0, FX1... + [AUTO] Group same-day CBCTs as CBCT1, CBCT2 + [DISPLAY] Proposed fraction assignment table + [PROMPT] "Accept fraction assignment? (y/n/edit)" + +STAGE 5: ANONYMISATION + [AUTO] Run DicomAnonymiser on CT_SET/ and DICOM_PLAN/ + [DISPLAY] Summary: "Anonymised X CT files and Y plan files with ID: PATxx" + [PROMPT] "Verify anonymisation. Continue? 
(y/n)" + +STAGE 6: FOLDER STRUCTURE CREATION + [AUTO] Create LEARN directory structure + [AUTO] Copy .his files to KIM-KV/ + [AUTO] Copy .SCAN files to Reconstructed CBCT/ + [AUTO] Copy .RPS.dcm to Registration file/ + [AUTO] Copy anonymised CT/Plan to Patient Plans/ + [DISPLAY] File manifest with counts and sizes + +STAGE 7: TREATMENT NOTES GENERATION + [AUTO] Generate Treatment_Notes.xlsx pre-filled with extracted data + [PROMPT] "Enter mAs value (or Enter if same as mA): " + [PROMPT] "Marker Length and Type (or N/A): " + [PROMPT] "CDOG version (or N/A): " + [AUTO] Save to Patient Files/PATxx/Treatment_Notes.xlsx + +STAGE 8: TRANSFER READINESS CHECK + [AUTO] Validate directory structure against LEARN template + [AUTO] Count files per fraction, verify completeness + [AUTO] Spot-check anonymised files for residual PHI + [DISPLAY] Final summary with total size and file counts + [PROMPT] "Ready for SFTP transfer. Use Cyberduck/WinSCP to upload." +``` + +## Data Extraction: Automatic vs Manual + +| Data Point | Auto? 
| Source |
+|------------|-------|--------|
+| Patient MRN | Yes | Folder name `patient_XXXXXXXX` |
+| Treatment plan name | Yes | `_Frames.xml` treatment ID element |
+| Scan date/time | Yes | INI `ScanUID` (embedded datetime) |
+| kV | Yes | INI `TubeKV=` |
+| mA | Yes | INI `TubeMA=` |
+| Registration shifts (Sup/Lat/Ant) | Yes | RPS.dcm via ElektaRPSExtractor |
+| Angular corrections (Cor/Sag/Trans) | Yes | RPS.dcm rotation values |
+| Fraction number | Derived | Chronological sort of scan dates |
+| CBCT number within fraction | Derived | Same-day grouping |
+| FOV type (SFOV/MFOV) | Partial | Could check INI.XVI CollimatorName |
+| Sequential anonymised ID | **Manual** | User assigns from Patient Data Log |
+| Anatomical site name | **Manual** | User determines from plan name |
+| Height/weight | **Manual** | Mosaiq assessments only |
+| Target in projections | **Manual** | Visual check with Contour Alignment Tool |
+| Mosaiq shift verification | **Manual** | Cross-check with Mosaiq SRO display |
+| Plan verified in Monaco | **Manual** | User opens Monaco 3D view |
+
+## Implementation Phases
+
+### Phase 1: Foundation
+1. Create `learn_upload/` package with `__init__.py`, `config.py`, `utils.py`
+2. Implement INI parsing utilities (generalise from existing scripts)
+3. Write unit tests for INI parsing
+
+### Phase 2: Anonymisation
+4. Implement `anonymise_dicom.py` with `DicomAnonymiser`
+5. Test against sample DICOM files
+6. Verify UID integrity in anonymised output
+
+### Phase 3: Folder Mapping
+7. Implement `folder_sort.py` with `LearnFolderMapper`
+8. Implement fraction assignment logic
+9. Add dry-run mode
+10. Validate output against `Docs/Prostate/` template
+
+### Phase 4: Treatment Notes
+11. Implement `treatment_notes.py`
+12. Match formatting to existing Treatment_Notes.xlsx templates
+
+### Phase 5: Wrapper Integration
+13. Implement `upload_workflow.py` with all 8 stages
+14. 
End-to-end testing with real patient directory + +## Technical Notes + +- All scripts use `pathlib.Path` for Windows UNC and mapped drive compatibility +- Source data is never modified -- all output goes to a staging directory +- Dry-run mode available for folder sorting (preview without copying) +- Large .his file copies use `shutil.copy2` with progress reporting +- Error handling follows existing patterns: graceful degradation for missing files/malformed data + +## Existing Code to Reuse + +| File | What to Reuse | +|------|--------------| +| `scripts/elektafdt_crawler.py` | Patient directory traversal, `_Frames.xml` parsing, CSV output | +| `scripts/extract_elekta_rps_matrices.py` | `ElektaRPSExtractor` class for RPS.dcm shift extraction, ZIP-embedded INI parsing | diff --git a/Docs/Screenshots/Applied-sro-dicom-conformance-pg60.jpeg b/Docs/Screenshots/Applied-sro-dicom-conformance-pg60.jpeg new file mode 100644 index 0000000..b158883 Binary files /dev/null and b/Docs/Screenshots/Applied-sro-dicom-conformance-pg60.jpeg differ diff --git a/Docs/Screenshots/Applied-sro-dicom-conformance-pg61.jpeg b/Docs/Screenshots/Applied-sro-dicom-conformance-pg61.jpeg new file mode 100644 index 0000000..2f210c9 Binary files /dev/null and b/Docs/Screenshots/Applied-sro-dicom-conformance-pg61.jpeg differ diff --git a/Docs/Screenshots/Applied-sro-dicom-conformance-pg62.jpeg b/Docs/Screenshots/Applied-sro-dicom-conformance-pg62.jpeg new file mode 100644 index 0000000..9c71dcd Binary files /dev/null and b/Docs/Screenshots/Applied-sro-dicom-conformance-pg62.jpeg differ diff --git a/Docs/XVI export instructions.docx b/Docs/XVI export instructions.docx new file mode 100644 index 0000000..5ae6b6a Binary files /dev/null and b/Docs/XVI export instructions.docx differ diff --git a/Docs/elekta_rps_format_documentation.md b/Docs/elekta_rps_format_documentation.md new file mode 100644 index 0000000..07e2cab --- /dev/null +++ b/Docs/elekta_rps_format_documentation.md @@ -0,0 +1,234 @@ +# 
Elekta XVI RPS DICOM File Format - Technical Summary + +## Overview +Elekta XVI exports Registration Position Storage (RPS) files as DICOM with modality "REG", but these are NOT standard DICOM Spatial Registration Objects. Instead, Elekta uses a proprietary format with embedded data. + +## File Structure + +### DICOM Header +- **Modality**: REG +- **SOP Class UID**: 1.2.840.10008.5.1.4.1.1.66 (Raw Data Storage) +- **Series Description**: RPS +- **Manufacturer**: ELEKTA + +### Private Tags +Elekta stores the actual registration data in private DICOM tags: + +- **(0019,0010)**: "ELEKTA" (private creator) +- **(0021,0010)**: "Elekta: zip file" +- **(0021,0011)**: "Elekta: RPS data" +- **(0021,103A)**: The actual ZIP file data (VR=UN) + +### Embedded ZIP Archive +The private tag (0021,103A) contains a ZIP file with: + +1. **[UID].INI.XVI** - Main registration data file (contains matrices!) +2. **[UID].INI** - Basic settings and display parameters +3. **[UID].MASK.[date]** - Binary mask data + +## Registration Data Format + +The .INI.XVI file contains the registration matrices in plain text format: + +``` +OnlineToRefTransformUnMatched = [16 space-separated float values] +OnlineToRefTransformCorrection = [16 space-separated float values] +``` + +### Matrix Format +- 16 values representing a 4×4 homogeneous transformation matrix +- **Row-major order** storage +- Values are in **centimeters** for translation +- Rotation matrix component uses standard 3×3 rotation representation + +### Matrix Structure +``` +[ R11 R12 R13 Tx ] +[ R21 R22 R23 Ty ] +[ R31 R32 R33 Tz ] +[ 0 0 0 1 ] +``` + +Where: +- R = 3×3 rotation matrix +- T = translation vector (cm) + +## Coordinate Systems + +### IEC Convention +From the INI file: +- `IECAngleConvention=1` +- `IECLinearConvention=2` + +### Typical XVI Matrix +The matrices typically show a coordinate system transformation. 
For example: +``` +[[ 0 0 -1 0 ] + [ 0 1 0 0 ] + [ 1 0 0 0 ] + [Tx Ty Tz 1 ]] +``` + +This represents: +- 90° rotation about Y-axis (X becomes -Z, Z becomes X) +- Translation by (Tx, Ty, Tz) + +## Alignment Information Available + +### 1. Clipbox Alignment +Initial bone-based registration (translation + rotation): +- Lateral, Longitudinal, Vertical (cm) +- Rotation, Pitch, Roll (degrees) + +### 2. Mask Alignment +Refined grey-value registration within mask (translation + rotation): +- Lateral, Longitudinal, Vertical (cm) +- Rotation, Pitch, Roll (degrees) + +### 3. Couch Shifts +Applied corrections: +- Lateral shift (cm) +- Longitudinal shift (cm) +- Vertical shift (cm) + +### 4. Transformation Matrices +- **Unmatched**: Fixed coordinate-system rotation (XVI to patient/IEC coordinates) plus isocenter offset. This matrix is **identical across all fractions** for a given patient — it contains no patient-specific correction. +- **Correction**: Same coordinate rotation as Unmatched, plus the patient-specific translational and rotational corrections from the registration. The relative rotation `R_correction @ R_unmatched^(-1)` yields a near-identity matrix encoding the small correction angles. + +In practice, the Clipbox INI values (`Align.clip1`) provide the corrections directly — matrix decomposition is not needed for extracting shifts and rotations. + +### 5. 
INI Field Relationships + +The INI `[ALIGNMENT]` section contains several related representations of the same correction: + +```ini +Align.clip1=-0.21, 0.05, -0.28, 0.4, 0.8, 359.8 +# lat long vert rot pitch roll (6-DOF) + +Align.correction=-0.21, 0.05, -0.28, 0.4, 0.8, 359.8 +# identical to Align.clip1 when Clipbox is the correction source + +CouchShiftLat=0.21 # = negated Clipbox lateral +CouchShiftLong=-0.05 # = negated Clipbox longitudinal +CouchShiftHeight=0.28 # = negated Clipbox vertical +CouchPitch=- # unavailable (couch rotation not recorded separately) +CouchRoll=- +CouchYaw=- +``` + +The CouchShift values are the **negated** Clipbox translations — they represent the physical couch movement direction (opposite to the image-space correction). Rotation values above 180 degrees use a 360-degree wrapping convention (e.g. 359.8 means -0.2 degrees). + +## Mapping to Mosaiq CBCT Shift Records + +The following mapping was validated against 6 CBCT registration sessions (PAT01, 4 fractions) by comparing RPS Clipbox values to Mosaiq image-list shift records. All non-zero values matched exactly. + +### Translations (cm) + +| Mosaiq Field | RPS Clipbox Field | Sign | CouchShift Field | Sign | +|---|---|---|---|---| +| Sup/Inf | `longitudinal` | same | `CouchShiftLong` | negated | +| Lft/Rht | `lateral` | same | `CouchShiftLat` | negated | +| Ant/Pos | `vertical` | same | `CouchShiftHeight` | negated | + +Mosaiq sign convention: Sup=+, Inf=-, Lft=+, Rht=-, Ant=+, Pos=-. + +### Rotations (degrees) + +| Mosaiq Field | RPS Clipbox Field | Sign | +|---|---|---| +| Cor(B) | `roll` | same | +| Sag(B) | `rotation` | same | +| Trans(B) | `pitch` | **negated** | + +Mosaiq sign convention: CW=+, CCW=-. Clipbox angles >180 must be unwrapped (subtract 360) before comparison. + +**The rotation axes are permuted** — Mosaiq's Coronal maps to Clipbox Roll, Sagittal maps to Clipbox Rotation, and Transverse maps to negated Clipbox Pitch. 
This was confirmed across all 4 non-zero rotation cases with exact agreement. + +### Verification Script + +The mapping was determined using `cbct-shifts/compare_rps_mosaiq.py`, which parses the Mosaiq TSV log (`cbct-shifts/CBCT-shifts-from-mosaiq.txt`), extracts RPS data from all fraction CBCT files, matches records by date/time, and prints a side-by-side comparison. + +## Safety Considerations + +⚠️ **CRITICAL SAFETY NOTES:** + +1. **Coordinate System Validation** + - ALWAYS verify the coordinate system conventions when importing these matrices into other systems + - XVI uses IEC conventions which may differ from your TPS + - The rotation matrix and translation vector must be interpreted correctly + +2. **Matrix Application Order** + - Verify whether matrices are applied as pre-multiplication or post-multiplication + - Understand the reference frames: Online→Reference vs Reference→Online + +3. **Units** + - Translation values are in CENTIMETERS + - Rotations are in DEGREES (in alignment parameters) + - Verify unit consistency when exporting to other systems + +4. **Clinical Validation** + - Any automated extraction and use of these matrices for treatment must be validated + - Verify end-to-end with known test cases + - Compare against XVI display for several cases + +5. **Version Compatibility** + - This format is for XVI 5.x (seen in your file: "NKI-XVI 5.103") + - Different XVI versions may have format variations + - Always test with your specific XVI version + +## Extraction Methods + +### Method 1: Python with pydicom (Recommended) +```python +from extract_elekta_rps_matrices import ElektaRPSExtractor # in scripts/ + +extractor = ElektaRPSExtractor("rps_file.dcm") +extractor.extract_all() +correction_matrix = extractor.get_correction_matrix(0) +``` + +### Method 2: Manual Extraction +1. Read DICOM file +2. Extract private tag (0021,103A) +3. Unzip the embedded data +4. Parse .INI.XVI file for matrix strings +5. 
Convert 16-element array to 4×4 matrix
+
+## Common Use Cases
+
+1. **QA Verification**: Compare XVI registrations with independent calculations
+2. **Data Analysis**: Analyze registration patterns over time
+3. **Export to TPS**: Import XVI registrations into treatment planning systems
+4. **Research**: Study registration accuracy and reproducibility
+
+## Example Output from Your File
+
+**Patient**: Anonymised, DOB (01-01-1999)
+**Treatment**: LtLungSBRT
+**Alignment Date**: 2023-10-10 16:17:49
+**Protocol**: Clipbox → Mask
+
+**Couch Shifts Applied**:
+- Lateral: -0.12 cm
+- Longitudinal: 0.54 cm
+- Vertical: 0.12 cm
+
+**Correction Matrix**:
+```
+[[ 0. 0. -1. 0. ]
+ [ 0. 1. 0. 0. ]
+ [ 1. 0. 0. 0. ]
+ [10.8 4.92 5.21 1. ]]
+```
+
+## References & Resources
+
+- Elekta XVI User Manual (version-specific)
+- DICOM Standard PS3.3 (for private tag conventions)
+- IEC 61217: Radiotherapy equipment - Coordinates, movements and scales
+
+---
+
+**Document Version**: 1.1
+**Date**: 2026-02-24
+**Tool**: scripts/extract_elekta_rps_matrices.py, cbct-shifts/compare_rps_mosaiq.py
diff --git a/Docs/elekta_xvi_sro_experimental_validation.md b/Docs/elekta_xvi_sro_experimental_validation.md
new file mode 100644
index 0000000..2154e1c
--- /dev/null
+++ b/Docs/elekta_xvi_sro_experimental_validation.md
@@ -0,0 +1,20 @@
+# Validation via yellow submarine experimentation on Amaroo
+
+
+
+Aim: Validate the registration files and what they do. Make sure the orientations are correct
+
+Aim2: Check how this maps on to the Mosaiq image list shifts
+
+# Methods: Yellow submarine on Amaroo.
+CBCT and align
+Do some shifts
+etc ask AM for the measurement doc we did. 
+
+# Results:
+shifts according to SRO in xvi exported dicom file
+shifts according to lasers
+shifts according to mosaiq
+
+
+
diff --git a/Docs/images/GC_Elekta_Patient_Upload_Process/01_xvi_reconstruction_ini.png b/Docs/images/GC_Elekta_Patient_Upload_Process/01_xvi_reconstruction_ini.png
new file mode 100644
index 0000000..ca3c59f
Binary files /dev/null and b/Docs/images/GC_Elekta_Patient_Upload_Process/01_xvi_reconstruction_ini.png differ
diff --git a/Docs/images/GC_Elekta_Patient_Upload_Process/02_contour_alignment_tool.png b/Docs/images/GC_Elekta_Patient_Upload_Process/02_contour_alignment_tool.png
new file mode 100644
index 0000000..b361227
Binary files /dev/null and b/Docs/images/GC_Elekta_Patient_Upload_Process/02_contour_alignment_tool.png differ
diff --git a/Docs/images/GC_Elekta_Patient_Upload_Process/03_spine_ptv_projection.png b/Docs/images/GC_Elekta_Patient_Upload_Process/03_spine_ptv_projection.png
new file mode 100644
index 0000000..550ec7c
Binary files /dev/null and b/Docs/images/GC_Elekta_Patient_Upload_Process/03_spine_ptv_projection.png differ
diff --git a/Docs/images/GC_Elekta_Patient_Upload_Process/04_mosaiq_images_icon.png b/Docs/images/GC_Elekta_Patient_Upload_Process/04_mosaiq_images_icon.png
new file mode 100644
index 0000000..cbd98d9
Binary files /dev/null and b/Docs/images/GC_Elekta_Patient_Upload_Process/04_mosaiq_images_icon.png differ
diff --git a/Docs/images/GC_Elekta_Patient_Upload_Process/05_mosaiq_sro_shifts.png b/Docs/images/GC_Elekta_Patient_Upload_Process/05_mosaiq_sro_shifts.png
new file mode 100644
index 0000000..9c9ef05
Binary files /dev/null and b/Docs/images/GC_Elekta_Patient_Upload_Process/05_mosaiq_sro_shifts.png differ
diff --git a/Docs/images/GC_Elekta_Patient_Upload_Process/06_mosaiq_sro_detailed.png b/Docs/images/GC_Elekta_Patient_Upload_Process/06_mosaiq_sro_detailed.png
new file mode 100644
index 0000000..51f43b7
Binary files /dev/null and b/Docs/images/GC_Elekta_Patient_Upload_Process/06_mosaiq_sro_detailed.png
differ diff --git a/Docs/images/GC_Elekta_Patient_Upload_Process/11_learn_folder_structure.png b/Docs/images/GC_Elekta_Patient_Upload_Process/11_learn_folder_structure.png new file mode 100644 index 0000000..cb94fb3 Binary files /dev/null and b/Docs/images/GC_Elekta_Patient_Upload_Process/11_learn_folder_structure.png differ diff --git a/Docs/images/GC_Elekta_Patient_Upload_Process/12_mosaiq_offset_reference.png b/Docs/images/GC_Elekta_Patient_Upload_Process/12_mosaiq_offset_reference.png new file mode 100644 index 0000000..ca8f164 Binary files /dev/null and b/Docs/images/GC_Elekta_Patient_Upload_Process/12_mosaiq_offset_reference.png differ diff --git a/Docs/images/gui-walkthrough/01.png b/Docs/images/gui-walkthrough/01.png new file mode 100644 index 0000000..4235880 Binary files /dev/null and b/Docs/images/gui-walkthrough/01.png differ diff --git a/Docs/images/gui-walkthrough/02.png b/Docs/images/gui-walkthrough/02.png new file mode 100644 index 0000000..af673d0 Binary files /dev/null and b/Docs/images/gui-walkthrough/02.png differ diff --git a/Docs/images/gui-walkthrough/03.png b/Docs/images/gui-walkthrough/03.png new file mode 100644 index 0000000..3297d70 Binary files /dev/null and b/Docs/images/gui-walkthrough/03.png differ diff --git a/Docs/images/gui-walkthrough/04.png b/Docs/images/gui-walkthrough/04.png new file mode 100644 index 0000000..94075ec Binary files /dev/null and b/Docs/images/gui-walkthrough/04.png differ diff --git a/Docs/images/gui-walkthrough/05.png b/Docs/images/gui-walkthrough/05.png new file mode 100644 index 0000000..77141bc Binary files /dev/null and b/Docs/images/gui-walkthrough/05.png differ diff --git a/Docs/images/gui-walkthrough/06.png b/Docs/images/gui-walkthrough/06.png new file mode 100644 index 0000000..c039c2f Binary files /dev/null and b/Docs/images/gui-walkthrough/06.png differ diff --git a/Docs/images/gui-walkthrough/07.png b/Docs/images/gui-walkthrough/07.png new file mode 100644 index 0000000..f0d207a Binary files 
/dev/null and b/Docs/images/gui-walkthrough/07.png differ
diff --git a/Docs/images/gui-walkthrough/08.png b/Docs/images/gui-walkthrough/08.png
new file mode 100644
index 0000000..55d1e2e
Binary files /dev/null and b/Docs/images/gui-walkthrough/08.png differ
diff --git a/Docs/images/gui-walkthrough/09.png b/Docs/images/gui-walkthrough/09.png
new file mode 100644
index 0000000..48e2c60
Binary files /dev/null and b/Docs/images/gui-walkthrough/09.png differ
diff --git a/README.md b/README.md
index ae82b37..a700223 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,324 @@
 This repo contains the code used in analyzing the SPARK data. In the 'Analysis code' folder, the codes are used extract different data properties. The SPARK Triangulation code is used to perform kV/MV triangulation for post-treatment verification of KIM tracking accuracy.
 
-In Data anonymisation folder, the codes were written to anonymize the SPARK data follwoing TCIA recommendation.
+In Data anonymisation folder, the codes were written to anonymize the SPARK data following TCIA recommendation. For details please check the README in each folder.
+
+---
+
+# learn-crawler
+
+**Author:** Kaan
+
+Automation tools for the LEARN data transfer pipeline -- transferring Elekta XVI CBCT patient data from GC (GenesisCare) to the USYD RDS research drive. Replaces manual anonymisation, folder sorting, and PII verification with a guided desktop workflow.
+
+## Download
+
+Grab the latest **`learn_upload.exe`** from [GitHub Releases](https://github.com/kkaan/learn-crawler/releases/latest). No Python installation required -- just download and double-click.
+
+### Important: Use Local Drives
+
+For best performance, **copy your XVI export data to a local drive** (e.g. `E:\` or `C:\`) before running the tool. XVI exports contain thousands of small `.his` projection files -- reading these over a network share is significantly slower than working from a local disk.
+
+1. 
Copy the `patient_XXXXXXXX/` folder from the network to a local drive +2. Run `learn_upload.exe` +3. Point the source path to your local copy +4. Set the output path to another local directory (e.g. `E:\LEARN_OUTPUT`) +5. Once processing is complete, copy the output to the RDS research drive + +### What the GUI Does + +The wizard walks you through 6 steps: + +1. **Configuration** -- set paths, anonymised ID (PATxx), and PII search strings +2. **Data Preview** -- discover XVI sessions and preview fraction assignments +3. **Folder Sort** -- copy files into the LEARN directory structure +4. **Anonymise** -- run DICOM anonymisation with per-file progress +5. **PII Verification** -- scan output for residual patient data +6. **CBCT Shift Report** -- generate a markdown report of CBCT registration shifts + +See the **[GUI Walkthrough](Docs/GUI_Walkthrough.md)** for a step-by-step guide with screenshots. + +## Developer Setup + +```bash +# Clone the repository +git clone https://github.com/kkaan/learn-crawler.git +cd learn-crawler + +# Install dependencies +pip install -r requirements-dev.txt +pip install PyQt6 # for GUI development + +# Launch the GUI from source +python -m learn_upload + +# Run tests +python -m pytest tests/ -v + +# Run linter +ruff check learn_upload/ tests/ scripts/ +``` + +## Repository Layout + +| Directory | Description | +|-----------|-------------| +| `learn_upload/` | Core Python package -- anonymisation, folder sorting, PII verification, GUI | +| `cbct-shifts/` | CBCT shift analysis scripts (Mosaiq vs RPS comparison, patient reports) | +| `scripts/` | Standalone CLI tools (RPS matrix extraction, DICOM tag reader, XVI crawler) | +| `examples/` | Pipeline usage examples | +| `tests/` | pytest test suite for `learn_upload` | +| `Docs/` | SOP documentation, automation plan, format specs | +| `Data/` | Sample/reference data files | + +## `learn_upload` Package + +The `learn_upload/` package automates the LEARN data transfer pipeline -- transferring 
Elekta XVI CBCT patient data from GC (GenesisCare) to the USYD RDS research drive, replacing manual steps with Python scripts. + +### Modules + +| Module | Purpose | +|--------|---------| +| `config.py` | Shared configuration, paths, DICOM tag lists, logging setup | +| `utils.py` | INI parsing, XML parsing, ScanUID datetime extraction, couch shift parsing | +| `anonymise_dicom.py` | DICOM anonymisation (replaces manual MIM workflow) | +| `folder_sort.py` | XVI export to LEARN directory structure mapping and file copying | +| `verify_pii.py` | Post-anonymisation scan for residual patient-identifiable data | +| `gui_qt.py` | PyQt6 desktop GUI wrapping all pipeline steps | + +### Using the Python API (no GUI) + +#### DICOM Anonymisation + +```python +from pathlib import Path +from learn_upload.anonymise_dicom import DicomAnonymiser + +anonymiser = DicomAnonymiser( + patient_dir=Path(r"E:\XVI_COLLECTION\processed\20230403_Flinders\patient_12345678"), + anon_id="PAT01", + output_dir=Path(r"E:\staging\patient_12345678"), +) + +summary = anonymiser.anonymise_all() +print(summary) +# {'ct_count': 182, 'plan_count': 1, 'anon_id': 'PAT01'} +``` + +#### Folder Mapping and File Sorting + +```python +from pathlib import Path +from learn_upload.folder_sort import LearnFolderMapper + +mapper = LearnFolderMapper( + patient_dir=Path(r"E:\XVI_COLLECTION\processed\20230403_Flinders\patient_12345678"), + anon_id="PAT01", + site_name="Prostate", + output_base=Path(r"E:\LEARN_OUTPUT"), +) + +# Preview (dry run) +summary = mapper.execute(dry_run=True) + +# Run for real +summary = mapper.execute( + anon_ct_dir=Path(r"E:\staging\patient_12345678\CT_SET"), + anon_plan_dir=Path(r"E:\staging\patient_12345678\DICOM_PLAN"), + dry_run=False, +) +``` + +#### PII Verification + +```python +from pathlib import Path +from learn_upload.verify_pii import verify_no_pii + +findings = verify_no_pii( + directory=Path(r"E:\LEARN_OUTPUT\Prostate\Patient Plans\PAT01"), + pii_strings=["12345678", "SMITH", 
"JOHN"], +) +if findings: + print("PII detected!") +``` + +#### Full End-to-End Pipeline + +```python +from pathlib import Path +from learn_upload.anonymise_dicom import DicomAnonymiser +from learn_upload.folder_sort import LearnFolderMapper +from learn_upload.verify_pii import verify_no_pii + +patient = Path(r"E:\XVI_COLLECTION\processed\20230403_Flinders\patient_12345678") +staging = Path(r"E:\staging\patient_12345678") +output = Path(r"E:\LEARN_OUTPUT") + +# Step 1: Anonymise DICOM files +anon = DicomAnonymiser(patient, "PAT01", staging) +anon.anonymise_all() + +# Step 2: Map folders and copy files +mapper = LearnFolderMapper(patient, "PAT01", "Prostate", output) +summary = mapper.execute( + anon_ct_dir=staging / "CT_SET", + anon_plan_dir=staging / "DICOM_PLAN", +) +print(f"Copied {summary['sessions']} sessions across {summary['fractions']} fractions") + +# Step 3: Verify no residual PII +findings = verify_no_pii(output / "Prostate" / "Patient Plans" / "PAT01", ["12345678"]) +``` + +### Utility Functions + +The `utils.py` module provides reusable parsing functions: + +```python +from learn_upload.utils import ( + parse_xvi_ini, # Parse Elekta XVI INI files for patient/scan metadata + parse_scan_datetime, # Extract datetime from ScanUID strings + parse_frames_xml, # Parse _Frames.xml for treatment ID, acquisition preset, kV/mA + parse_couch_shifts, # Extract couch shift values from INI text + extract_ini_from_rps, # Extract ZIP-embedded INI from RPS DICOM files +) +``` + +### Session Types + +The mapper classifies XVI acquisitions by their `AcquisitionPresetName` in `_Frames.xml`: + +| Type | Preset example | Destination | +|------|---------------|-------------| +| CBCT | `4ee Pelvis Soft S20 179-181` | `.his` to `CBCT Projections/IPS/`, `.SCAN` to `Reconstructed CBCT/` | +| KIM Learning | `12aa KIM S20 R 34-181` | Same as CBCT (treated identically) | +| KIM MotionView | `13a KIM S20 MotionView` | `.his` to `KIM-KV/{img_dirname}/` | + +### Expected Input: XVI 
Export Directory + +The tool expects an Elekta XVI export with this structure: + +``` +patient_XXXXXXXX/ + IMAGES/ + img_/ <- one directory per acquisition session + _Frames.xml <- treatment ID, acquisition preset, kV/mA + *.his <- raw X-ray projections + Reconstruction/ + *.INI or *.INI.XVI <- ScanUID (embeds datetime), patient metadata + *.SCAN <- reconstructed CBCT volume + *.SCAN.MACHINEORIENTATION <- machine orientation metadata + *.RPS.dcm <- registration DICOM (couch shifts, matrices) + img_/ ... <- additional sessions + CT_SET/ <- reference CT DICOM (optional, for anonymisation) + *.dcm + DICOM_PLAN/ <- treatment plan DICOM (optional, for anonymisation) + *.dcm +``` + +The `IMAGES/` subdirectory name is configurable (`images_subdir` parameter); the default is `"IMAGES"`. + +### Optional Inputs + +These are passed via the GUI or Python API when available: + +| Input | Description | +|-------|-------------| +| **TPS Export directory** | Contains `DICOM CT Images/`, `DICOM RT Plan/`, `DICOM RT Structures/`, `DICOM RT Dose/` subdirectories from Monaco/TPS export | +| **Trajectory log directory** | Contains `FX01/`, `FX02/`, ... subdirectories with `MarkerLocations*.txt` files | +| **Centroid file** | Single `.txt` file (e.g. `Centroid_12345678_BeamID_1.1_1.2.txt`); MRN in filename is replaced with anon ID during copy | + +### Output: LEARN Directory Structure + +``` +/ e.g. "Prostate" + Patient Files// + cbct_shift_report.md <- generated CBCT shift report + Centroid__BeamID_*.txt <- anonymised centroid file (if provided) + Patient Images// + FX1/ <- fractions numbered chronologically + CBCT/ + CBCT1/ <- multiple CBCTs per fraction if same-day + CBCT Projections/ + CDOG/ + IPS/ + *.his <- projection files + _Frames.xml + Reconstructed CBCT/ + *.SCAN <- volume files + *.INI / *.INI.XVI <- metadata + Registration file/ + *.RPS.dcm <- registration DICOM + CBCT2/ ... + KIM-KV/ <- MotionView sessions (if present) + img_/ + *.his + _Frames.xml + FX2/ ... 
+ Patient Plans// + CT/ <- anonymised CT DICOM + Plan/ <- anonymised plan DICOM + Dose/ <- anonymised dose DICOM (if provided) + Structure Set/ <- anonymised structure DICOM (if provided) + Ground Truth// + Trajectory Logs// <- only if trajectory logs provided + FX01/ + Trajectory Logs/ + MarkerLocations*.txt + Treatment Records/ + FX02/ ... +``` + +## CBCT Shift Analysis + +The `cbct-shifts/` directory contains scripts for comparing XVI RPS registration data with Mosaiq CBCT shift records: + +- `compare_rps_mosaiq.py` -- matches RPS DICOM registrations to Mosaiq log entries by date/time and prints a side-by-side 6-DOF comparison +- `report_patient_details.py` -- generates patient-level CBCT shift reports + +## Standalone Scripts + +| Script | Description | +|--------|-------------| +| [`scripts/extract_elekta_rps_matrices.py`](scripts/extract_elekta_rps_matrices.py) | Extract XVI RPS registration matrices and alignment data from `.RPS.dcm` files | +| [`scripts/read_dicom_tags.py`](scripts/read_dicom_tags.py) | Read and display DICOM tags from any `.dcm` file | +| [`scripts/elektafdt_crawler.py`](scripts/elektafdt_crawler.py) | Crawl XVI export directories and list treatment plans from `_Frames.xml` | + +## Examples + +See [`examples/run_patient_example.py`](examples/run_patient_example.py) for a complete end-to-end pipeline example using `DicomAnonymiser`, `LearnFolderMapper`, and `verify_no_pii`. 
+ +## Documentation + +- **[GUI Walkthrough](Docs/GUI_Walkthrough.md)** -- step-by-step guide to the LEARN Pipeline GUI with screenshots +- [GC Elekta Patient Upload Process](Docs/GC_Elekta_Patient_Upload_Process.md) -- SOP for patient data transfer to LEARN +- [LEARN Upload Automation Plan](Docs/LEARN_Upload_Automation_Plan.md) -- full automation plan for the pipeline +- [Elekta XVI Reconstruction Directory Analysis](Docs/Elekta_XVI_Reconstruction_Directory_Analysis.md) -- directory and file breakdown +- [Elekta XVI RPS Format Documentation](Docs/elekta_rps_format_documentation.md) -- RPS DICOM file format and coordinate mapping +- [Experimental Validation Notes](Docs/elekta_xvi_sro_experimental_validation.md) -- validation by phantom measurement + +## Running Tests + +```bash +# All tests +python -m pytest tests/ -v + +# Specific modules +python -m pytest tests/test_utils.py -v +python -m pytest tests/test_anonymise_dicom.py -v +python -m pytest tests/test_folder_sort.py -v +python -m pytest tests/test_verify_pii.py -v +``` + +## Background: Target ROI Registration + +The LEARN trial requires that the *Target* contour is visible in at least 180 degrees of x-ray projections during CBCT acquisition. The current markerless tracking model cannot be trained otherwise. + +Plan: +- Verify export process from XVI as described +- Validate exported RPS/SRO objects with ground truth phantom measurements +- Expand the contour alignment tool by ImageX to use the SRO object to align contours to projections +- Use this to screen whether a patient is suitable for recruitment diff --git a/cbct-shifts/compare_rps_mosaiq.py b/cbct-shifts/compare_rps_mosaiq.py new file mode 100644 index 0000000..7f088e9 --- /dev/null +++ b/cbct-shifts/compare_rps_mosaiq.py @@ -0,0 +1,523 @@ +#!/usr/bin/env python3 +""" +Compare Mosaiq CBCT shift records with Elekta XVI RPS DICOM data. 
def parse_direction_value(text):
    """Convert a Mosaiq directional string (e.g. 'Sup 0.1 cm') to a signed float.

    Sign convention (Mosaiq):
        Translations: Sup=+, Inf=-, Lft=+, Rht=-, Ant=+, Pos=-
        Rotations:    CW=+, CCW=-

    Returns None for empty/missing values or text that does not match.
    """
    stripped = text.strip() if text else ''
    if not stripped:
        return None

    match = re.match(r'(Sup|Inf|Lft|Rht|Ant|Pos|CW|CCW)\s+([\d.]+)\s*(cm|deg\.?)?', stripped)
    if match is None:
        return None

    value = float(match.group(2))
    if match.group(1) in ('Inf', 'Rht', 'Pos', 'CCW'):
        value = -value
    return value


def parse_mosaiq_log(filepath):
    """Read the Mosaiq CBCT shifts TSV and return a list of record dicts."""

    def cell(row, idx):
        # Rows can be ragged; treat a missing column as an empty string.
        return row[idx] if len(row) > idx else ''

    with open(filepath, 'r', encoding='utf-8-sig') as f:
        reader = csv.reader(f, delimiter='\t', quotechar='"')
        next(reader)  # discard the header row
        # Materialise everything up front: quoted Comments fields may span
        # physical lines, and csv.reader has already stitched them together.
        rows = list(reader)

    records = []
    for row in rows:
        # Continuation rows of a multi-line quoted field carry no Date/Time
        # in column 1; a logical record always has one.
        if len(row) < 12 or not row[1].strip():
            continue

        try:
            timestamp = datetime.strptime(row[1].strip(), "%d/%m/%Y %I:%M %p")
        except ValueError:
            continue

        record = {
            'datetime': timestamp,
            'type': cell(row, 4).strip(),
            'sup': parse_direction_value(cell(row, 11)),
            'lat': parse_direction_value(cell(row, 12)),
            'ant': parse_direction_value(cell(row, 13)),
            'cor_b': parse_direction_value(cell(row, 15)),
            'sag_b': parse_direction_value(cell(row, 16)),
            'trans_b': parse_direction_value(cell(row, 17)),
        }

        # Magnitude is a bare scalar with no direction prefix.
        mag_text = cell(row, 14).strip()
        try:
            record['mag'] = float(mag_text) if mag_text else None
        except ValueError:
            record['mag'] = None

        # Flag records that carry at least one translation value.
        record['has_shifts'] = any(
            record[key] is not None for key in ('sup', 'lat', 'ant')
        )

        records.append(record)

    return records
def extract_rps_data(rps_path):
    """Extract all relevant data from an RPS DICOM file.

    Runs ElektaRPSExtractor with its console output suppressed and returns
    a dict of datetimes, couch shifts, clipbox/mask alignments, correction
    matrices and the raw embedded INI text.
    """
    extractor = ElektaRPSExtractor(str(rps_path))

    # The extractor prints its findings; swallow that so callers control output.
    sink = io.StringIO()
    with redirect_stdout(sink):
        extractor.extract_all()

    # Alignment date/time from the embedded INI: DateTime=YYYYMMDD; HH:MM:SS
    ini_text = extractor.ini_content or ''
    ini_dt = None
    stamp = re.search(r'DateTime=(\d{8});\s*(\d{2}:\d{2}:\d{2})', ini_text)
    if stamp:
        ini_dt = datetime.strptime(
            ' '.join(stamp.groups()), "%Y%m%d %H:%M:%S"
        )

    # DICOM ContentDate/ContentTime as a backup timestamp.
    dcm = extractor.dcm
    dicom_dt = None
    if hasattr(dcm, 'ContentDate') and hasattr(dcm, 'ContentTime'):
        # ContentTime is HHMMSS.ffffff; drop the fractional seconds.
        time_part = dcm.ContentTime.split('.')[0]
        dicom_dt = datetime.strptime(
            f"{dcm.ContentDate} {time_part}", "%Y%m%d %H%M%S"
        )

    alignment = extractor.alignment_info
    return {
        'ini_datetime': ini_dt,
        'dicom_datetime': dicom_dt,
        'couch_shifts': alignment.get('couch_shifts', {}),
        'clipbox': alignment.get('clipbox', {}),
        'mask': alignment.get('mask', {}),
        'correction_matrices': extractor.matrices.get('correction', []),
        # Raw INI content kept for debugging.
        'ini_content': extractor.ini_content,
    }
def unwrap_angle(deg):
    """Unwrap angle from [0,360) to [-180,180). e.g. 359.8 -> -0.2.

    Parameters
    ----------
    deg : float or None
        Angle in degrees, or None for a missing value.

    Returns
    -------
    float or None
        Equivalent angle in [-180, 180), or None when *deg* is None.
    """
    if deg is None:
        return None
    # Use >= so that exactly 180 maps to -180, keeping the result inside
    # the half-open interval [-180, 180) promised above (with '>' the
    # value 180 leaked through unchanged).
    if deg >= 180:
        return deg - 360
    return deg


def fmt(val, width=8):
    """Format a float right-aligned to 3 decimals in *width* columns.

    None renders as *width* spaces so table columns stay aligned when a
    value is missing.
    """
    if val is None:
        return ' ' * width
    return f"{val:>{width}.3f}"
def print_comparison(matches):
    """Print detailed comparison and mapping analysis.

    For each matched pair, prints a per-session table of Mosaiq values
    against the RPS CouchShift/Clipbox/Mask values, then summary tables,
    then a candidate-mapping analysis, and finally the confirmed 6-DOF
    mapping. Output is console-only; nothing is returned.
    """
    print("=" * 90)
    print("MOSAIQ vs RPS SHIFT COMPARISON")
    print("=" * 90)

    # Accumulates one dict per successfully matched session for the
    # summary/analysis sections below.
    all_diffs = []

    for m in matches:
        rps = m['rps']
        mq = m['mosaiq']
        rps_data = rps['data']
        cs = rps_data['couch_shifts']
        cb = rps_data['clipbox']
        mk = rps_data['mask']

        print(f"\n{'-' * 90}")
        print(f" {rps['fx']}/{rps['cbct']}")
        print(f" RPS datetime: {m['rps_datetime']}")
        if mq:
            print(f" Mosaiq datetime: {mq['datetime']} (delta: {m['time_delta']})")
        else:
            # Unmatched sessions are reported but excluded from analysis.
            print(" Mosaiq: NO MATCH")
            continue

        print()
        print(f" {'Field':<22} {'Mosaiq':>10} {'CouchShift':>12} {'Clipbox':>12} {'Mask':>12}")
        print(f" {'-' * 22} {'-' * 10} {'-' * 12} {'-' * 12} {'-' * 12}")

        # Translations
        # Each tuple: (display label, Mosaiq key, CouchShift key, Clipbox key, Mask key)
        row_data = [
            ('Sup/Long (Sup+)', 'sup', 'longitudinal', 'longitudinal', 'longitudinal'),
            ('Lat (Lft+)', 'lat', 'lateral', 'lateral', 'lateral'),
            ('Ant/Vert (Ant+)', 'ant', 'vertical', 'vertical', 'vertical'),
        ]

        for label, mq_key, cs_key, cb_key, mk_key in row_data:
            mq_val = mq.get(mq_key)
            cs_val = cs.get(cs_key)
            cb_val = cb.get(cb_key)
            mk_val = mk.get(mk_key)
            print(f" {label:<22} {fmt(mq_val, 10)} {fmt(cs_val, 12)} {fmt(cb_val, 12)} {fmt(mk_val, 12)}")

        # Rotations — axes are permuted between Mosaiq and Clipbox:
        #   Mq Cor(B) = CB roll (same sign)
        #   Mq Sag(B) = CB rotation (same sign)
        #   Mq Trans(B) = -CB pitch (negated)
        rot_data = [
            ('Cor(B) = CB Roll', 'cor_b', 'roll', 'roll'),
            ('Sag(B) = CB Rot', 'sag_b', 'rotation', 'rotation'),
            ('Trans(B) = -CB Ptch','trans_b','pitch', 'pitch'),
        ]

        print()
        for label, mq_key, cb_key, mk_key in rot_data:
            mq_val = mq.get(mq_key)
            cb_val = unwrap_angle(cb.get(cb_key))
            mk_val = unwrap_angle(mk.get(mk_key))
            # Negate pitch for Trans mapping display
            neg = " (neg)" if cb_key == 'pitch' else ""
            print(f" {label:<22} {fmt(mq_val, 10)} {'':>12} {fmt(cb_val, 12)}{neg} {fmt(mk_val, 12)}")

        # Collect differences for mapping analysis
        diff = {
            'label': f"{rps['fx']}/{rps['cbct']}",
            'mq': mq,
            'cs': cs,
            'cb': cb,
            'mk': mk,
        }
        all_diffs.append(diff)

    # ---------------------------------------------------------------
    # Summary table
    # ---------------------------------------------------------------
    print(f"\n\n{'=' * 90}")
    print("SUMMARY TABLE -- All Matches")
    print("=" * 90)

    header = (
        f" {'FX/CBCT':<12}"
        f" {'Mq Sup':>8} {'CS Long':>8} {'CB Long':>8} {'Mk Long':>8}"
        f" {'Mq Lat':>8} {'CS Lat':>8} {'CB Lat':>8} {'Mk Lat':>8}"
        f" {'Mq Ant':>8} {'CS Vert':>8} {'CB Vert':>8} {'Mk Vert':>8}"
    )
    print(header)
    print(" " + "-" * (len(header) - 2))

    for d in all_diffs:
        mq, cs, cb, mk = d['mq'], d['cs'], d['cb'], d['mk']
        print(
            f" {d['label']:<12}"
            f" {fmt(mq['sup'])} {fmt(cs.get('longitudinal'))} {fmt(cb.get('longitudinal'))} {fmt(mk.get('longitudinal'))}"
            f" {fmt(mq['lat'])} {fmt(cs.get('lateral'))} {fmt(cb.get('lateral'))} {fmt(mk.get('lateral'))}"
            f" {fmt(mq['ant'])} {fmt(cs.get('vertical'))} {fmt(cb.get('vertical'))} {fmt(mk.get('vertical'))}"
        )

    # ---------------------------------------------------------------
    # Rotation summary (axes paired by confirmed mapping)
    # ---------------------------------------------------------------
    print(f"\n {'FX/CBCT':<12}"
          f" {'Mq Cor':>8} {'CB Roll':>8}"
          f" {'Mq Sag':>8} {'CB Rot':>8}"
          f" {'Mq Trn':>8} {'-CB Pch':>8}")
    print(" " + "-" * 70)

    for d in all_diffs:
        mq, cb, mk = d['mq'], d['cb'], d['mk']
        neg_pitch = unwrap_angle(cb.get('pitch'))
        if neg_pitch is not None:
            neg_pitch = -neg_pitch
        print(
            f" {d['label']:<12}"
            f" {fmt(mq['cor_b'])} {fmt(unwrap_angle(cb.get('roll')))}"
            f" {fmt(mq['sag_b'])} {fmt(unwrap_angle(cb.get('rotation')))}"
            f" {fmt(mq['trans_b'])} {fmt(neg_pitch)}"
        )

    # ---------------------------------------------------------------
    # Mapping analysis
    # ---------------------------------------------------------------
    print(f"\n\n{'=' * 90}")
    print("MAPPING ANALYSIS")
    print("=" * 90)

    # Check each candidate mapping between Mosaiq and RPS fields
    axis_pairs = [
        # (mosaiq_key, rps_source, rps_key, description)
        ('sup', 'CouchShift', 'longitudinal', 'Mq Sup <-> CS Longitudinal'),
        ('sup', 'Clipbox', 'longitudinal', 'Mq Sup <-> CB Longitudinal'),
        ('sup', 'Mask', 'longitudinal', 'Mq Sup <-> Mk Longitudinal'),
        ('lat', 'CouchShift', 'lateral', 'Mq Lat <-> CS Lateral'),
        ('lat', 'Clipbox', 'lateral', 'Mq Lat <-> CB Lateral'),
        ('lat', 'Mask', 'lateral', 'Mq Lat <-> Mk Lateral'),
        ('ant', 'CouchShift', 'vertical', 'Mq Ant <-> CS Vertical'),
        ('ant', 'Clipbox', 'vertical', 'Mq Ant <-> CB Vertical'),
        ('ant', 'Mask', 'vertical', 'Mq Ant <-> Mk Vertical'),
    ]

    # Confirmed cross-axis rotation mapping
    rot_pairs = [
        ('cor_b', 'Clipbox', 'roll', 'Mq Cor(B) = CB Roll'),
        ('sag_b', 'Clipbox', 'rotation', 'Mq Sag(B) = CB Rotation'),
        ('trans_b', 'Clipbox', 'neg_pitch','Mq Trans(B) = -CB Pitch'),
    ]

    # Keys that denote angles and therefore need unwrapping before comparison.
    rotation_keys = {'rotation', 'pitch', 'roll', 'neg_pitch'}

    def analyse_pairs(pairs, all_diffs):
        # For each candidate Mosaiq<->RPS field pairing, count how often the
        # magnitudes agree (within 0.05) and whether signs match or flip.
        for mq_key, rps_src, rps_key, desc in pairs:
            src_map = {'CouchShift': 'cs', 'Clipbox': 'cb', 'Mask': 'mk'}
            src = src_map[rps_src]
            is_rotation = rps_key in rotation_keys
            # Handle negated pitch: look up 'pitch' and negate
            negate = rps_key == 'neg_pitch'
            actual_key = 'pitch' if negate else rps_key

            mag_matches = 0
            sign_matches = 0
            sign_flips = 0
            total = 0

            for d in all_diffs:
                mq_val = d['mq'].get(mq_key)
                rps_val = d[src].get(actual_key)
                if is_rotation and rps_val is not None:
                    rps_val = unwrap_angle(rps_val)
                if negate and rps_val is not None:
                    rps_val = -rps_val
                if mq_val is None or rps_val is None:
                    continue
                total += 1
                if abs(abs(mq_val) - abs(rps_val)) < 0.05:
                    mag_matches += 1
                # Near-zero values (<= 0.001) carry no reliable sign information.
                if abs(mq_val) > 0.001 and abs(rps_val) > 0.001:
                    if (mq_val > 0) == (rps_val > 0):
                        sign_matches += 1
                    else:
                        sign_flips += 1

            if total == 0:
                continue

            sign_info = ""
            if sign_matches > 0 and sign_flips == 0:
                sign_info = "SAME sign"
            elif sign_flips > 0 and sign_matches == 0:
                sign_info = "FLIPPED sign"
            elif sign_matches > 0 and sign_flips > 0:
                sign_info = f"MIXED ({sign_matches} same, {sign_flips} flip)"
            else:
                sign_info = "all zero"

            print(f" {desc:<35} mag={mag_matches}/{total} {sign_info}")

    print("\n Translations:")
    analyse_pairs(axis_pairs, all_diffs)
    print("\n Rotations (confirmed cross-axis mapping):")
    analyse_pairs(rot_pairs, all_diffs)

    # ---------------------------------------------------------------
    # Confirmed mapping summary
    # ---------------------------------------------------------------
    print(f"\n\n{'=' * 90}")
    print("CONFIRMED 6-DOF MAPPING: Mosaiq <-> RPS Clipbox")
    print("=" * 90)
    print("""
 TRANSLATIONS (cm, Clipbox values direct; CouchShift values negated):
   Mosaiq Sup/Inf = Clipbox longitudinal = -CouchShiftLong
   Mosaiq Lft/Rht = Clipbox lateral = -CouchShiftLat
   Mosaiq Ant/Pos = Clipbox vertical = -CouchShiftHeight

 ROTATIONS (deg, Clipbox values with angle unwrap >180 -> negative):
   Mosaiq Cor(B) = Clipbox roll (same sign)
   Mosaiq Sag(B) = Clipbox rotation (same sign)
   Mosaiq Trans(B) = -Clipbox pitch (negated)

 NOTE: Rotation axes are PERMUTED between Mosaiq and XVI Clipbox.
 NOTE: CouchPitch/CouchRoll/CouchYaw are unavailable in these RPS files.

 MATRIX INFO: Each RPS contains 2 matrices:
   - OnlineToRefTransformUnMatched: fixed coord rotation (same for all fractions)
   - OnlineToRefTransformCorrection: includes patient-specific correction
   The Clipbox INI values are sufficient; matrix decomposition is not needed.
""")
f"Long={cs.get('longitudinal')}, Vert={cs.get('vertical')}") + cb = d['clipbox'] + print(f" Clipbox: Lat={cb.get('lateral')}, " + f"Long={cb.get('longitudinal')}, Vert={cb.get('vertical')}, " + f"Rot={cb.get('rotation')}, Pitch={cb.get('pitch')}, " + f"Roll={cb.get('roll')}") + + # --- C: Match records --- + print("\n\nMatching RPS records to Mosaiq records...") + matches = match_records(mosaiq_records, rps_files) + + matched = sum(1 for m in matches if m['mosaiq'] is not None) + print(f" {matched}/{len(matches)} RPS files matched to Mosaiq records") + + # --- D: Print comparison --- + print_comparison(matches) + + +if __name__ == "__main__": + main() diff --git a/cbct-shifts/report_patient_details.py b/cbct-shifts/report_patient_details.py new file mode 100644 index 0000000..f05ffbf --- /dev/null +++ b/cbct-shifts/report_patient_details.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 +""" +Generate a markdown report of CBCT shift details for a patient. + +Extracts correction values from RPS DICOM files and converts them to +Mosaiq-convention shifts (Sup/Lat/Ant translations + Cor/Sag/Trans rotations). +""" + +import sys +from collections import OrderedDict +from datetime import datetime +from pathlib import Path + +# Import shared utilities from compare_rps_mosaiq (same directory) +sys.path.insert(0, str(Path(__file__).resolve().parent)) +from compare_rps_mosaiq import ( + find_rps_files, + extract_rps_data, + unwrap_angle, + REPO_ROOT, +) + +# RedCap template header fields with defaults. +# Edit these values to match the site/protocol before generating reports. 
REDCAP_DEFAULTS = OrderedDict([
    ("RedCap ID", ""),
    ("Image Collected", "CBCTs"),
    ("Linac Type", "Versa HD"),
    ("Imager Position (SDD)", "150 cm"),
    ("Couch Type", "Precise Table/Hexapod"),
    ("Coordinate System", ""),
    ("kV", "120"),
    ("mAs", "25"),
    ("Marker Length and Type", ""),
    ("Cdog Version", ""),
])


def clipbox_to_mosaiq(clipbox):
    """Convert RPS Clipbox values to Mosaiq-convention shifts.

    Returns dict with Mosaiq field names and signed values. Translations
    pass through directly; rotation axes are permuted (roll -> Cor,
    rotation -> Sag, negated pitch -> Trans).
    """
    def angle(key):
        # Missing rotations default to 0.0; unwrap keeps them in [-180, 180).
        return unwrap_angle(clipbox.get(key, 0.0)) or 0.0

    shifts = {
        'sup': clipbox.get('longitudinal', 0.0),
        'lat': clipbox.get('lateral', 0.0),
        'ant': clipbox.get('vertical', 0.0),
    }
    shifts['cor'] = angle('roll')
    shifts['sag'] = angle('rotation')
    shifts['trans'] = -angle('pitch')
    return shifts
def main():
    """CLI entry point: print the report and save ``<patient_id>_report.md``.

    The patient directory is the first CLI argument when given, otherwise
    the default PAT01 output tree. Exits with status 1 when no report can
    be generated.
    """
    args = sys.argv[1:]
    patient_path = (
        Path(args[0])
        if args
        else REPO_ROOT / "output" / "Prostate" / "Patient Images" / "PAT01"
    )

    report = generate_report(patient_path)
    if report is None:
        sys.exit(1)

    # Echo to stdout first, then persist next to this script.
    print(report)

    destination = Path(__file__).resolve().parent / f"{patient_path.name}_report.md"
    destination.write_text(report, encoding="utf-8")
    print(f"\nReport written to {destination}")
def main() -> None:
    """Run the full LEARN pipeline for one hard-coded example patient.

    Steps: (1) anonymise each TPS DICOM category into a staging area,
    (2) map XVI sessions into the LEARN structure and copy files,
    (3) scan the output for residual PII, (4) log a summary.
    """
    setup_logging(logging.INFO)

    # ------------------------------------------------------------------
    # 1. Anonymise TPS DICOM files
    # ------------------------------------------------------------------
    logger.info("=== Step 1: Anonymise TPS DICOM files ===")
    # NOTE(review): 'anon' is never used below — the per-category 'cat_anon'
    # instances do the actual work. Constructing it does validate that
    # PATIENT_ROOT exists (DicomAnonymiser.__init__ raises FileNotFoundError
    # for a missing directory); confirm before removing.
    anon = DicomAnonymiser(
        patient_dir=PATIENT_ROOT,
        anon_id=ANON_ID,
        output_dir=STAGING_DIR,
        site_name=SITE_NAME,
    )

    # Classify and anonymise each DICOM category from TPS Export
    # NOTE(review): no "dose" entry here, so anon_dirs.get("dose") in step 2
    # is always None — confirm whether DICOM RT Dose should be included.
    categories = {
        "ct": TPS_EXPORT / "DICOM CT Images",
        "plan": TPS_EXPORT / "DICOM RT Plan",
        "structures": TPS_EXPORT / "DICOM RT Structures",
    }

    anon_dirs = {}
    for category, source_dir in categories.items():
        if source_dir.is_dir():
            cat_staging = STAGING_DIR / category
            cat_anon = DicomAnonymiser(
                patient_dir=PATIENT_ROOT,
                anon_id=ANON_ID,
                output_dir=cat_staging,
                site_name=SITE_NAME,
            )
            results = cat_anon.anonymise_all_dcm(source_dir)
            anon_dirs[category] = cat_staging
            logger.info(" %s: %d files anonymised", category, len(results))
        else:
            # Missing categories are skipped, not fatal — the mapper
            # receives None for them below.
            logger.warning(" %s directory not found: %s", category, source_dir)

    # ------------------------------------------------------------------
    # 2. Run folder mapper (XVI sessions + copy everything)
    # ------------------------------------------------------------------
    logger.info("=== Step 2: Folder mapping and file copy ===")
    mapper = LearnFolderMapper(
        patient_dir=PATIENT_ROOT,
        anon_id=ANON_ID,
        site_name=SITE_NAME,
        output_base=OUTPUT_BASE,
        images_subdir="XVI Export",
    )

    summary = mapper.execute(
        anon_ct_dir=anon_dirs.get("ct"),
        anon_plan_dir=anon_dirs.get("plan"),
        anon_struct_dir=anon_dirs.get("structures"),
        anon_dose_dir=anon_dirs.get("dose"),
        centroid_path=CENTROID_FILE,
        trajectory_base_dir=PATIENT_ROOT,  # FX01-FX04 are direct children
        dry_run=False,
    )

    # ------------------------------------------------------------------
    # 3. Verify no residual PII in output
    # ------------------------------------------------------------------
    logger.info("=== Step 3: PII verification ===")
    pii_strings = ["12345678", "SMITH", "JOHN"]
    output_patient_dir = OUTPUT_BASE / SITE_NAME / "Patient Plans" / ANON_ID
    findings = verify_no_pii(output_patient_dir, pii_strings)
    if findings:
        logger.error("PII DETECTED — review findings above")
    else:
        logger.info("PII verification passed")

    # ------------------------------------------------------------------
    # 4. Summary
    # ------------------------------------------------------------------
    logger.info("=== Pipeline complete ===")
    logger.info("Sessions discovered: %d", summary["sessions"])
    logger.info("Fractions assigned: %d", summary["fractions"])
    for key, val in summary["files_copied"].items():
        logger.info(" %-15s %d", key, val)
# Analysis: collect the entry script plus everything it imports.
a = Analysis(
    ["learn_upload/__main__.py"],
    # cbct-shifts/ and scripts/ are plain directories (not packages), so they
    # must be on the search path for the hidden imports below to resolve.
    pathex=["cbct-shifts", "scripts"],
    binaries=[],
    datas=[],
    hiddenimports=[
        "PyQt6.QtCore",
        "PyQt6.QtGui",
        "PyQt6.QtWidgets",
        "pydicom",
        "numpy",
        # Standalone script modules imported dynamically at runtime —
        # PyInstaller cannot discover these through static analysis.
        "report_patient_details",
        "compare_rps_mosaiq",
        "extract_elekta_rps_matrices",
    ],
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    # Heavy packages not used by the GUI; excluding them shrinks the exe.
    excludes=[
        "matplotlib",
        "scipy",
        "pandas",
        "tkinter",
        "PIL",
        "IPython",
        "notebook",
        "sphinx",
    ],
    noarchive=False,
)

pyz = PYZ(a.pure)

# One-file windowed build: console=False hides the terminal window;
# runtime_tmpdir=None unpacks to the default temp location at launch.
exe = EXE(
    pyz,
    a.scripts,
    a.binaries,
    a.datas,
    [],
    name="learn_upload",
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    upx_exclude=[],
    runtime_tmpdir=None,
    console=False,
    disable_windowed_traceback=False,
)
def main():
    """Launch the GUI: the PyQt6 wizard by default, or the original
    pywebview GUI when ``--legacy`` is present on the command line."""
    if "--legacy" in sys.argv:
        # Fallback to the original pywebview implementation.
        from learn_upload.gui import main as launch
    else:
        from learn_upload.gui_qt import main as launch
    launch()
Handles all file +types that may contain patient-identifiable information: DICOM files, +_Frames.xml, INI configuration files, centroid files, and trajectory logs. +""" + +import logging +import re +import time +import xml.etree.ElementTree as ET +from pathlib import Path + +import pydicom +from pydicom.valuerep import PersonName + +from learn_upload.config import DICOM_TAGS_CLEAR, DICOM_TAGS_REPLACE + +logger = logging.getLogger(__name__) + + +class DicomAnonymiser: + """Anonymise CT and plan DICOM files for a single patient.""" + + def __init__( + self, patient_dir: Path, anon_id: str, output_dir: Path, site_name: str = "" + ) -> None: + self.patient_dir = Path(patient_dir) + self.anon_id = anon_id + self.output_dir = Path(output_dir) + self.site_name = site_name + + if not self.patient_dir.is_dir(): + raise FileNotFoundError( + f"Patient directory does not exist: {self.patient_dir}" + ) + + def _anonymise_filename(self, filename: str) -> str: + """Replace parenthesised patient name in filename with anon_id. + + E.g. ``DCMRT_Plan(SMITH JOHN).dcm`` → ``DCMRT_Plan(PAT01).dcm`` + """ + return re.sub(r"\([^)]+\)", f"({self.anon_id})", filename) + + def anonymise_file(self, dcm_path: Path, source_base: Path = None) -> Path: + """Anonymise a single DICOM file and save to the staging directory. + + Tags in DICOM_TAGS_REPLACE are set to *anon_id*; tags in + DICOM_TAGS_CLEAR are set to empty string (skipped if absent). + All other tags — including UIDs — are left untouched. + + Parameters + ---------- + dcm_path : Path + Path to the DICOM file to anonymise. + source_base : Path, optional + Base directory for computing relative output path. Defaults to + ``self.patient_dir`` (original behaviour). + + Returns the path of the written output file. 
+ """ + dcm = pydicom.dcmread(dcm_path) + + # Capture original PatientID before replacing — used for scrubbing + original_patient_id = str(getattr(dcm, "PatientID", "")) + + for tag in DICOM_TAGS_REPLACE: + if tag == (0x0010, 0x0010): # PatientName + dcm[tag].value = PersonName(f"{self.anon_id}^{self.site_name}") + else: + dcm[tag].value = self.anon_id + + for tag in DICOM_TAGS_CLEAR: + if tag in dcm: + dcm[tag].value = "" + + # Scrub original patient ID from StudyDescription (XVI RPS files + # embed MRN in text like "Tx Plan for 12345678 on ...") + if original_patient_id and hasattr(dcm, "StudyDescription"): + dcm.StudyDescription = dcm.StudyDescription.replace( + original_patient_id, self.anon_id + ) + + # Mirror the subdirectory structure relative to source_base + base = Path(source_base) if source_base is not None else self.patient_dir + relative = dcm_path.relative_to(base) + # Anonymise the filename (replace parenthesised patient name) + anon_name = self._anonymise_filename(relative.name) + output_path = self.output_dir / relative.parent / anon_name + output_path.parent.mkdir(parents=True, exist_ok=True) + + dcm.save_as(output_path) + logger.info("Anonymised %s -> %s", dcm_path.name, output_path) + return output_path + + def anonymise_all_dcm(self, source_dir: Path) -> list[Path]: + """Recursively find and anonymise every ``.dcm`` file under *source_dir*. + + Unlike :meth:`anonymise_ct_set` / :meth:`anonymise_plan`, this method + does not assume any particular subdirectory layout — it simply walks the + tree and anonymises every DICOM file it finds. + + Returns a list of output file paths. 
+ """ + source_dir = Path(source_dir) + if not source_dir.is_dir(): + logger.warning("Source directory does not exist: %s", source_dir) + return [] + + files = sorted(source_dir.rglob("*.dcm"), key=str) + sorted( + source_dir.rglob("*.DCM"), key=str + ) + # Deduplicate (on case-insensitive filesystems *.dcm and *.DCM overlap) + seen: set[Path] = set() + unique: list[Path] = [] + for f in files: + resolved = f.resolve() + if resolved not in seen: + seen.add(resolved) + unique.append(f) + + results = [self.anonymise_file(f, source_base=source_dir) for f in unique] + logger.info( + "anonymise_all_dcm: %d files anonymised under %s", len(results), source_dir + ) + return results + + def _glob_dcm(self, subdir: str) -> list[Path]: + """Return all .dcm/.DCM files under patient_dir/subdir.""" + folder = self.patient_dir / subdir + if not folder.is_dir(): + logger.warning("Directory not found: %s", folder) + return [] + files = sorted( + p for p in folder.iterdir() + if p.suffix.lower() == ".dcm" + ) + if not files: + logger.warning("No DCM files in %s", folder) + return files + + def anonymise_ct_set(self) -> list[Path]: + """Anonymise all DICOM files in CT_SET/.""" + return [self.anonymise_file(f) for f in self._glob_dcm("CT_SET")] + + def anonymise_plan(self) -> list[Path]: + """Anonymise all DICOM files in DICOM_PLAN/.""" + return [self.anonymise_file(f) for f in self._glob_dcm("DICOM_PLAN")] + + def anonymise_frames_xml(self, xml_path: Path, output_path: Path) -> Path: + """Anonymise a ``_Frames.xml`` file, removing patient PII. + + Replaces ````, ````, and ```` with + *anon_id*. Also regex-scrubs the original patient ID from + ```` text (if present). + + Parameters + ---------- + xml_path : Path + Path to the source ``_Frames.xml``. + output_path : Path + Destination path for the anonymised XML. + + Returns + ------- + Path + The written output file path. 
+ """ + tree = ET.parse(xml_path) + root = tree.getroot() + + # Detect original patient ID before replacing it + patient_el = root.find("Patient") + original_id = None + if patient_el is not None: + id_el = patient_el.find("ID") + if id_el is not None and id_el.text: + original_id = id_el.text.strip() + + # Replace PII tags + for tag_name in ("FirstName", "LastName", "ID"): + el = patient_el.find(tag_name) + if el is not None: + if tag_name == "FirstName": + el.text = "" + else: + el.text = self.anon_id + + # Scrub original patient ID from Treatment/Description + if original_id: + treatment_el = root.find("Treatment") + if treatment_el is not None: + desc_el = treatment_el.find("Description") + if desc_el is not None and desc_el.text: + desc_el.text = desc_el.text.replace(original_id, self.anon_id) + + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + tree.write(output_path, encoding="unicode", xml_declaration=True) + logger.info("Anonymised _Frames.xml %s -> %s", xml_path, output_path) + return output_path + + def anonymise_all(self) -> dict: + """Anonymise CT_SET and DICOM_PLAN, returning a summary dict.""" + ct_files = self.anonymise_ct_set() + plan_files = self.anonymise_plan() + summary = { + "ct_count": len(ct_files), + "plan_count": len(plan_files), + "anon_id": self.anon_id, + } + logger.info( + "Anonymisation complete: %d CT, %d plan files (ID: %s)", + summary["ct_count"], + summary["plan_count"], + summary["anon_id"], + ) + return summary + + +# --------------------------------------------------------------------------- +# Standalone anonymisation helpers (operate on already-copied output files) +# --------------------------------------------------------------------------- + + +def anonymise_ini_file(ini_path: Path, anon_id: str) -> None: + """Anonymise an XVI ``.INI`` or ``.INI.XVI`` file in-place. 
+ + Replaces ``PatientID=xxx`` with ``PatientID={anon_id}``, + ``FirstName=xxx`` with ``FirstName=``, and + ``LastName=xxx`` with ``LastName={anon_id}``. + """ + ini_path = Path(ini_path) + text = ini_path.read_text(encoding="utf-8", errors="replace") + + text = re.sub(r"(?m)^PatientID=.*$", f"PatientID={anon_id}", text) + text = re.sub(r"(?m)^FirstName=.*$", "FirstName=", text) + text = re.sub(r"(?m)^LastName=.*$", f"LastName={anon_id}", text) + + ini_path.write_text(text, encoding="utf-8") + logger.info("Anonymised INI %s", ini_path) + + +def anonymise_centroid_file(file_path: Path, anon_id: str) -> Path: + """Anonymise a centroid file in-place. + + Lines 1 and 2 contain MRN and patient name; both are replaced with + *anon_id*. The MRN portion of the filename is also replaced. + + Returns the (possibly renamed) output path. + """ + file_path = Path(file_path) + lines = file_path.read_text(encoding="utf-8", errors="ignore").splitlines( + keepends=True + ) + + original_id = lines[0].strip() if lines else "" + + if len(lines) >= 1: + lines[0] = anon_id + "\n" + if len(lines) >= 2: + lines[1] = anon_id + "\n" + + file_path.write_text("".join(lines), encoding="utf-8") + + # Rename file if MRN appears in the filename + if original_id and original_id in file_path.name: + new_name = file_path.name.replace(original_id, anon_id) + new_path = file_path.parent / new_name + file_path.rename(new_path) + logger.info("Anonymised centroid -> %s", new_path.name) + return new_path + + logger.info("Anonymised centroid %s", file_path) + return file_path + + +def anonymise_trajectory_log(file_path: Path, original_id: str, anon_id: str) -> None: + """Anonymise a trajectory log file in-place. + + Replaces ``patient_{original_id}`` with ``patient_{anon_id}`` in + the file text (used for MarkerLocations*.txt files). 
+ """ + file_path = Path(file_path) + text = file_path.read_text(encoding="utf-8", errors="ignore") + + if original_id: + text = text.replace(f"patient_{original_id}", f"patient_{anon_id}") + + file_path.write_text(text, encoding="utf-8") + logger.info("Anonymised trajectory log %s", file_path) + + +def anonymise_output_folder( + output_dir: Path, + anon_id: str, + site_name: str, + patient_dir: Path, + tps_path: Path = None, + progress_callback=None, +) -> dict: + """Scan an entire output folder and anonymise all files in-place. + + This is the main entry point for the anonymisation step, called after + folder sort has copied raw files into the LEARN directory structure. + + Parameters + ---------- + output_dir : Path + Base output directory (contains *site_name*/ subtree). + anon_id : str + Anonymised patient identifier (e.g. ``PAT01``). + site_name : str + Site/treatment name (e.g. ``Prostate``). + patient_dir : Path + Original XVI patient directory (for extracting original MRN). + tps_path : Path, optional + TPS export directory. If provided, DICOM files are imported and + anonymised into Patient Plans/. + progress_callback : callable, optional + ``callback(current, total, filename)`` for progress reporting. + + Returns + ------- + dict + Summary: ``{"dcm": N, "xml": M, "ini": P, "centroid": Q, + "trajectory": R, "tps_imported": S, "errors": E}``. 
+ """ + output_dir = Path(output_dir) + patient_dir = Path(patient_dir) + site_root = output_dir / Path(site_name).name + + # Detect original patient ID from source directory name + patient_dir_name = patient_dir.name + original_id = "" + if patient_dir_name.lower().startswith("patient_"): + original_id = patient_dir_name[len("patient_"):] + + counts = { + "dcm": 0, "xml": 0, "ini": 0, "centroid": 0, + "trajectory": 0, "tps_imported": 0, "errors": 0, + } + + # --- Phase 1: Collect all files to process --- + files_to_process: list[Path] = [] + if site_root.is_dir(): + for f in site_root.rglob("*"): + if f.is_file(): + files_to_process.append(f) + + total = len(files_to_process) + processed = 0 + last_emit = 0.0 + + def _progress(filename: str) -> None: + nonlocal processed, last_emit + processed += 1 + if progress_callback: + now = time.monotonic() + if now - last_emit >= 0.2 or processed % 10 == 0 or processed == total: + progress_callback(processed, total, filename) + last_emit = now + + # --- Phase 2: Walk and anonymise --- + # Build a DicomAnonymiser that writes in-place (output_dir == source) + anon = DicomAnonymiser( + patient_dir=patient_dir, + anon_id=anon_id, + output_dir=site_root, + site_name=site_name, + ) + + for file_path in sorted(files_to_process): + name_lower = file_path.name.lower() + + try: + if name_lower == "_frames.xml": + anon.anonymise_frames_xml(file_path, file_path) + counts["xml"] += 1 + + elif name_lower.endswith((".ini", ".ini.xvi")): + anonymise_ini_file(file_path, anon_id) + counts["ini"] += 1 + + elif name_lower.endswith(".dcm"): + # In-place: read, anonymise, overwrite + dcm = pydicom.dcmread(file_path) + original_patient_id = str(getattr(dcm, "PatientID", "")) + + for tag in DICOM_TAGS_REPLACE: + if tag == (0x0010, 0x0010): # PatientName + dcm[tag].value = PersonName(f"{anon_id}^{site_name}") + else: + dcm[tag].value = anon_id + + for tag in DICOM_TAGS_CLEAR: + if tag in dcm: + dcm[tag].value = "" + + if original_patient_id and 
hasattr(dcm, "StudyDescription"): + dcm.StudyDescription = dcm.StudyDescription.replace( + original_patient_id, anon_id + ) + + dcm.save_as(file_path) + counts["dcm"] += 1 + + elif name_lower.startswith("markerlocations") and name_lower.endswith(".txt"): + anonymise_trajectory_log(file_path, original_id, anon_id) + counts["trajectory"] += 1 + + elif "patient files" in str(file_path.parent).lower() and name_lower.endswith(".txt"): + # Centroid files live in Patient Files/{anon_id}/ + anonymise_centroid_file(file_path, anon_id) + counts["centroid"] += 1 + + # .his and .SCAN — skip (binary data) + + except Exception: + logger.exception("Failed to anonymise %s", file_path) + counts["errors"] += 1 + + _progress(file_path.name) + + # --- Phase 3: TPS import (if provided) --- + if tps_path: + tps_path = Path(tps_path) + if tps_path.is_dir(): + plans_root = site_root / "Patient Plans" / anon_id + tps_categories = { + "DICOM CT Images": "CT", + "DICOM RT Plan": "Plan", + "DICOM RT Structures": "Structure Set", + "DICOM RT Dose": "Dose", + } + + # Count TPS files for progress update + tps_files: list[tuple[Path, str]] = [] + for src_name, dest_name in tps_categories.items(): + src_dir = tps_path / src_name + if src_dir.is_dir(): + for dcm_file in sorted(src_dir.rglob("*.dcm")): + tps_files.append((dcm_file, dest_name)) + + total += len(tps_files) + + for dcm_file, dest_name in tps_files: + dest_dir = plans_root / dest_name + dest_dir.mkdir(parents=True, exist_ok=True) + try: + tps_anon = DicomAnonymiser( + patient_dir=patient_dir, + anon_id=anon_id, + output_dir=dest_dir, + site_name=site_name, + ) + tps_anon.anonymise_file(dcm_file, source_base=dcm_file.parent) + counts["tps_imported"] += 1 + except Exception: + logger.exception("Failed to anonymise TPS file %s", dcm_file) + counts["errors"] += 1 + _progress(dcm_file.name) + + logger.info("anonymise_output_folder complete: %s", counts) + return counts diff --git a/learn_upload/config.py b/learn_upload/config.py new file 
mode 100644 index 0000000..70b1d3f --- /dev/null +++ b/learn_upload/config.py @@ -0,0 +1,76 @@ +""" +Centralised configuration for the learn_upload package. + +Paths, constants, DICOM tag lists, and logging setup used across all modules. +""" + +import logging +from pathlib import Path + +# --------------------------------------------------------------------------- +# Anonymised ID format +# --------------------------------------------------------------------------- +ANON_ID_PREFIX = "PAT" + + +def make_anon_id(n: int) -> str: + """Format a sequential anonymised patient ID, e.g. make_anon_id(1) -> 'PAT01'.""" + return f"{ANON_ID_PREFIX}{n:02d}" + + +# --------------------------------------------------------------------------- +# Default data paths (override via CLI or environment as needed) +# --------------------------------------------------------------------------- +DEFAULT_XVI_BASE = Path(r"E:\XVI_COLLECTION\processed\20230403_Flinders") +DEFAULT_LEARN_OUTPUT = Path(r"E:\LEARN_OUTPUT") + +# --------------------------------------------------------------------------- +# Elekta XVI DICOM private tags +# --------------------------------------------------------------------------- +# RPS DICOM files embed a ZIP archive in this private tag containing .INI.XVI +# registration data. See scripts/extract_elekta_rps_matrices.py for usage. +RPS_ZIP_TAG = (0x0021, 0x103A) + +# --------------------------------------------------------------------------- +# DICOM tags for anonymisation +# --------------------------------------------------------------------------- +# Tags whose value is replaced with the anonymised ID (PATxx). +DICOM_TAGS_REPLACE = { + (0x0010, 0x0010): "PatientName", + (0x0010, 0x0020): "PatientID", + (0x0020, 0x0010): "StudyID", +} + +# Tags that are cleared (set to empty string). 
+DICOM_TAGS_CLEAR = { + (0x0010, 0x0030): "PatientBirthDate", + (0x0010, 0x1000): "OtherPatientIDs", + (0x0010, 0x1001): "OtherPatientNames", + (0x0008, 0x0050): "AccessionNumber", + (0x0008, 0x0080): "InstitutionName", + (0x0008, 0x0081): "InstitutionAddress", + (0x0008, 0x0090): "ReferringPhysicianName", + (0x0008, 0x1048): "PhysiciansOfRecord", + (0x0008, 0x1070): "OperatorsName", +} + +# Tags explicitly preserved — listed here for documentation; the anonymiser +# simply leaves any tag not in the replace/clear sets untouched. +DICOM_TAGS_PRESERVE = { + (0x0010, 0x0040): "PatientSex", + (0x0010, 0x1010): "PatientAge", + (0x0010, 0x1020): "PatientSize", + (0x0010, 0x1030): "PatientWeight", + (0x0008, 0x1030): "StudyDescription", + # All DICOM UIDs are preserved to maintain referential integrity. +} + +# --------------------------------------------------------------------------- +# Logging helper +# --------------------------------------------------------------------------- +LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + + +def setup_logging(level: int = logging.INFO) -> None: + """Configure root logging for learn_upload scripts.""" + logging.basicConfig(level=level, format=LOG_FORMAT) diff --git a/learn_upload/folder_sort.py b/learn_upload/folder_sort.py new file mode 100644 index 0000000..421ac65 --- /dev/null +++ b/learn_upload/folder_sort.py @@ -0,0 +1,703 @@ +"""Folder mapping and file sorting for the LEARN data transfer pipeline. + +Automates the manual SOP steps of: +1. Discovering XVI acquisition sessions from patient IMAGES/ directories +2. Classifying sessions as CBCT, KIM Learning, or KIM MotionView +3. Assigning sessions to treatment fractions (FX0, FX1, ...) +4. Creating the LEARN hierarchical directory structure +5. 
Copying files to their correct destinations +""" + +import logging +import re +import shutil +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import Optional + +import pydicom + +from learn_upload.utils import ( + extract_ini_from_rps, + parse_couch_shifts, + parse_frames_xml, + parse_scan_datetime, + parse_xvi_ini, +) + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Session dataclass +# --------------------------------------------------------------------------- + +@dataclass +class CBCTSession: + """Represents a single XVI acquisition session (img_* directory).""" + + img_dir: Path + dicom_uid: str + acquisition_preset: str + session_type: str # "cbct" | "kim_learning" | "motionview" + treatment_id: str + scan_datetime: Optional[datetime] = None + tube_kv: Optional[float] = None + tube_ma: Optional[float] = None + has_rps: bool = False + rps_path: Optional[Path] = None + couch_shifts: Optional[dict] = None + ini_path: Optional[Path] = None + + +# --------------------------------------------------------------------------- +# Classification +# --------------------------------------------------------------------------- + +def classify_acquisition(preset_name: str) -> str: + """Classify an acquisition preset into session type. + + Parameters + ---------- + preset_name : str + AcquisitionPresetName from _Frames.xml. + + Returns + ------- + str + ``"motionview"``, ``"kim_learning"``, or ``"cbct"``. 
+ """ + lower = preset_name.lower() + if "motionview" in lower: + return "motionview" + if "kim" in lower: + return "kim_learning" + return "cbct" + + +# --------------------------------------------------------------------------- +# Folder Mapper +# --------------------------------------------------------------------------- + +class LearnFolderMapper: + """Discovers XVI sessions and maps them to the LEARN directory structure.""" + + def __init__( + self, + patient_dir: Path, + anon_id: str, + site_name: str, + output_base: Path, + images_subdir: str = "IMAGES", + ) -> None: + self.patient_dir = Path(patient_dir) + self.anon_id = anon_id + self.site_name = site_name + self.output_base = Path(output_base) + self.images_subdir = images_subdir + + # ----- DICOM classification ----- + + _MODALITY_MAP = { + "CT": "ct", + "RTPLAN": "plan", + "RTSTRUCT": "structures", + "RTDOSE": "dose", + } + + @staticmethod + def classify_dicom_files(source_dir: Path) -> dict[str, list[Path]]: + """Classify ``.dcm`` files by DICOM Modality tag ``(0008,0060)``. + + Recursively walks *source_dir* and reads only the Modality tag from + each ``.dcm`` file to sort them into categories. + + Returns ``{"ct": [...], "plan": [...], "structures": [...], "dose": [...]}``. + Files with unrecognised modality are logged as warnings and excluded. 
+ """ + source_dir = Path(source_dir) + result: dict[str, list[Path]] = { + "ct": [], "plan": [], "structures": [], "dose": [], + } + + if not source_dir.is_dir(): + logger.warning("classify_dicom_files: directory not found: %s", source_dir) + return result + + all_dcm = sorted(source_dir.rglob("*.dcm"), key=str) + sorted( + source_dir.rglob("*.DCM"), key=str + ) + # Deduplicate (case-insensitive filesystems) + seen: set[Path] = set() + unique: list[Path] = [] + for f in all_dcm: + resolved = f.resolve() + if resolved not in seen: + seen.add(resolved) + unique.append(f) + + for dcm_path in unique: + try: + ds = pydicom.dcmread(dcm_path, stop_before_pixels=True) + modality = getattr(ds, "Modality", None) or "" + except Exception: + logger.warning("Could not read DICOM file: %s", dcm_path) + continue + + category = LearnFolderMapper._MODALITY_MAP.get(modality.upper()) + if category: + result[category].append(dcm_path) + else: + logger.warning( + "Unrecognised DICOM modality '%s' in %s — skipping", + modality, + dcm_path, + ) + + return result + + # ----- Discovery ----- + + def discover_sessions(self, enrich: bool = True) -> list[CBCTSession]: + """Scan IMAGES/img_* directories and build session objects. + + Returns + ------- + list[CBCTSession] + Sessions sorted by scan_datetime (None-datetime sessions at end). 
+ """ + images_dir = self.patient_dir / self.images_subdir + if not images_dir.is_dir(): + logger.warning("No IMAGES directory in %s", self.patient_dir) + return [] + + sessions: list[CBCTSession] = [] + for img_dir in sorted(images_dir.iterdir()): + if not img_dir.is_dir() or not img_dir.name.startswith("img_"): + continue + + frames_xml = img_dir / "_Frames.xml" + if not frames_xml.exists(): + logger.warning("No _Frames.xml in %s — skipping", img_dir) + continue + + meta = parse_frames_xml(frames_xml) + if meta.get("acquisition_preset") is None: + logger.warning("No AcquisitionPresetName in %s — skipping", frames_xml) + continue + + session_type = classify_acquisition(meta["acquisition_preset"]) + dicom_uid = meta.get("dicom_uid") or img_dir.name + + session = CBCTSession( + img_dir=img_dir, + dicom_uid=dicom_uid, + acquisition_preset=meta["acquisition_preset"], + session_type=session_type, + treatment_id=meta.get("treatment_id") or "", + tube_kv=meta.get("kv"), + tube_ma=meta.get("ma"), + ) + + # Extract datetime and registration data (all session types) + if enrich: + self._enrich_cbct_session(session) + + sessions.append(session) + + # Sort: sessions with datetime first (chronological), None-datetime at end + sessions.sort(key=lambda s: (s.scan_datetime is None, s.scan_datetime or datetime.min)) + return sessions + + def _enrich_cbct_session(self, session: CBCTSession) -> None: + """Populate datetime, RPS, and couch shifts for a CBCT/KIM Learning session.""" + recon_dir = session.img_dir / "Reconstruction" + + # Find INI file for ScanUID → datetime + if recon_dir.is_dir(): + ini_files = sorted(recon_dir.glob("*.INI")) + if ini_files: + session.ini_path = ini_files[0] + ini_text = ini_files[0].read_text(encoding="utf-8", errors="ignore") + ini_data = parse_xvi_ini(ini_text) + scan_uid = ini_data.get("ScanUID") + if scan_uid: + session.scan_datetime = parse_scan_datetime(scan_uid) + + # Find RPS DICOM → couch shifts + rps_files = 
sorted(session.img_dir.glob("Reconstruction/*.dcm")) + if not rps_files: + rps_files = sorted(session.img_dir.glob("Reconstruction/*.RPS.dcm")) + if rps_files: + session.rps_path = rps_files[0] + session.has_rps = True + ini_text = extract_ini_from_rps(rps_files[0]) + if ini_text: + session.couch_shifts = parse_couch_shifts(ini_text) + + # ----- MotionView date matching ----- + + def _match_motionview_dates( + self, + dated: list[CBCTSession], + undated: list[CBCTSession], + ) -> None: + """Assign scan_datetime to undated sessions by treatment_id and directory proximity. + + Matching strategy (in priority order): + 1. Same treatment_id → nearest by directory sort position + 2. No treatment_id match → nearest directory sort position overall + + All sessions share a parent directory; img_* names are sequential UIDs + so alphabetical proximity correlates with temporal proximity. + + Mutates undated sessions in place. + """ + if not dated or not undated: + return + + # Build a sorted directory index for proximity lookups + all_sessions = dated + undated + sorted_names = sorted(s.img_dir.name for s in all_sessions) + name_to_pos = {name: i for i, name in enumerate(sorted_names)} + + # Build lookup: treatment_id → list of dated sessions + by_treatment: dict[str, list[CBCTSession]] = {} + for d in dated: + tid = d.treatment_id.strip() + if tid: + by_treatment.setdefault(tid, []).append(d) + + for mv_session in undated: + best_match: Optional[CBCTSession] = None + mv_pos = name_to_pos.get(mv_session.img_dir.name, 0) + + # Strategy 1: match by treatment_id, pick nearest directory + tid = mv_session.treatment_id.strip() + candidates = by_treatment.get(tid, []) if tid else [] + if candidates: + best_match = min( + candidates, + key=lambda d: abs(name_to_pos.get(d.img_dir.name, 0) - mv_pos), + ) + + # Strategy 2: fallback to nearest directory overall + if best_match is None: + best_match = min( + dated, + key=lambda d: abs(name_to_pos.get(d.img_dir.name, 0) - mv_pos), + ) + + 
if best_match and best_match.scan_datetime: + mv_session.scan_datetime = best_match.scan_datetime + logger.info( + "Matched undated session %s → %s (treatment=%s, date=%s)", + mv_session.img_dir.name, + best_match.img_dir.name, + best_match.treatment_id, + best_match.scan_datetime.strftime("%Y-%m-%d"), + ) + else: + logger.warning( + "Could not match undated session %s to any dated session", + mv_session.img_dir.name, + ) + + # ----- Fraction assignment ----- + + def assign_fractions(self, sessions: list[CBCTSession]) -> dict[str, list[CBCTSession]]: + """Group sessions into fractions by date. + + Parameters + ---------- + sessions : list[CBCTSession] + All sessions with scan_datetime assigned. + + Returns + ------- + dict[str, list[CBCTSession]] + ``{"FX0": [...], "FX1": [...], ...}`` sorted chronologically. + """ + # Sort all sessions by datetime + sorted_sessions = sorted( + sessions, + key=lambda s: s.scan_datetime or datetime.min, + ) + + # Group by date + date_groups: dict[str, list[CBCTSession]] = {} + for s in sorted_sessions: + if s.scan_datetime is None: + date_key = "unknown" + else: + date_key = s.scan_datetime.strftime("%Y-%m-%d") + + if date_key not in date_groups: + date_groups[date_key] = [] + date_groups[date_key].append(s) + + # Assign fraction labels in chronological order + fraction_map: dict[str, list[CBCTSession]] = {} + for fx_idx, date_key in enumerate(sorted(date_groups.keys()), start=1): + fx_label = f"FX{fx_idx}" + fraction_map[fx_label] = date_groups[date_key] + + return fraction_map + + # ----- Directory creation ----- + + def create_learn_structure( + self, + fraction_map: dict[str, list[CBCTSession]], + trajectory_fx_labels: list[str] | None = None, + ) -> Path: + """Create the full LEARN directory tree. + + Parameters + ---------- + fraction_map : dict + Fraction label → list of sessions (for Patient Images). + trajectory_fx_labels : list[str], optional + Fraction labels (e.g. ``["FX01", "FX02"]``) for Trajectory Logs + directories. 
If *None*, trajectory dirs are not created. + + Returns the site root path. + """ + site_root = self.output_base / self.site_name + + # Patient Files + patient_files = site_root / "Patient Files" / self.anon_id + patient_files.mkdir(parents=True, exist_ok=True) + + # Patient Plans + plans_root = site_root / "Patient Plans" / self.anon_id + for subdir in ("CT", "Plan", "Dose", "Structure Set"): + (plans_root / subdir).mkdir(parents=True, exist_ok=True) + + # Ground Truth + gt_root = site_root / "Ground Truth" / self.anon_id + gt_root.mkdir(parents=True, exist_ok=True) + + # Patient Images — per fraction + images_root = site_root / "Patient Images" / self.anon_id + for fx_label, sessions in fraction_map.items(): + fx_path = images_root / fx_label + + # Count CBCT/KIM-Learning sessions for numbering + cbct_sessions = [ + s for s in sessions if s.session_type in ("cbct", "kim_learning") + ] + cbct_sessions.sort(key=lambda s: s.scan_datetime or datetime.min) + + for cbct_idx, _session in enumerate(cbct_sessions, start=1): + cbct_path = fx_path / "CBCT" / f"CBCT{cbct_idx}" + (cbct_path / "CBCT Projections" / "CDOG").mkdir(parents=True, exist_ok=True) + (cbct_path / "CBCT Projections" / "IPS").mkdir(parents=True, exist_ok=True) + (cbct_path / "Reconstructed CBCT").mkdir(parents=True, exist_ok=True) + (cbct_path / "Registration file").mkdir(parents=True, exist_ok=True) + + # KIM-KV directory (always created per fraction) + (fx_path / "KIM-KV").mkdir(parents=True, exist_ok=True) + + # Trajectory Logs — per fraction label + if trajectory_fx_labels: + for fx_label in trajectory_fx_labels: + traj_fx = site_root / "Trajectory Logs" / self.anon_id / fx_label + (traj_fx / "Trajectory Logs").mkdir(parents=True, exist_ok=True) + (traj_fx / "Treatment Records").mkdir(parents=True, exist_ok=True) + + return site_root + + # ----- File copying ----- + + def copy_cbct_files(self, session: CBCTSession, cbct_path: Path) -> dict: + """Copy CBCT/KIM-Learning files to the LEARN structure 
(raw, no anonymisation). + + Returns ``{"his": N, "scan": M, "rps": K, "frames_xml": 0|1, "ini": P}``. + """ + counts = {"his": 0, "scan": 0, "rps": 0, "frames_xml": 0, "ini": 0} + + # .his → CBCT Projections/IPS/ + ips_dir = cbct_path / "CBCT Projections" / "IPS" + ips_dir.mkdir(parents=True, exist_ok=True) + for his_file in sorted(session.img_dir.glob("*.his")): + shutil.copy2(his_file, ips_dir / his_file.name) + counts["his"] += 1 + + # _Frames.xml → CBCT Projections/IPS/_Frames.xml (raw copy) + frames_xml = session.img_dir / "_Frames.xml" + if frames_xml.exists(): + shutil.copy2(frames_xml, ips_dir / "_Frames.xml") + counts["frames_xml"] = 1 + + # Reconstruction/ → Reconstructed CBCT/ + recon_dest = cbct_path / "Reconstructed CBCT" + recon_dest.mkdir(parents=True, exist_ok=True) + recon_src = session.img_dir / "Reconstruction" + if recon_src.is_dir(): + for recon_file in sorted(recon_src.iterdir()): + if ".SCAN" in recon_file.name.upper(): + shutil.copy2(recon_file, recon_dest / recon_file.name) + counts["scan"] += 1 + elif recon_file.name.upper().endswith((".INI", ".INI.XVI")): + shutil.copy2(recon_file, recon_dest / recon_file.name) + counts["ini"] += 1 + + # RPS → Registration file/ (raw copy) + if session.rps_path and session.rps_path.exists(): + reg_dest = cbct_path / "Registration file" + reg_dest.mkdir(parents=True, exist_ok=True) + shutil.copy2(session.rps_path, reg_dest / session.rps_path.name) + counts["rps"] += 1 + + return counts + + def copy_motionview_files(self, session: CBCTSession, fx_path: Path) -> dict: + """Copy MotionView .his files to KIM-KV/{img_dirname}/. + + Returns ``{"his": N, "frames_xml": 0|1}``. 
+ """ + dest = fx_path / "KIM-KV" / session.img_dir.name + dest.mkdir(parents=True, exist_ok=True) + counts = {"his": 0, "frames_xml": 0} + for his_file in sorted(session.img_dir.glob("*.his")): + shutil.copy2(his_file, dest / his_file.name) + counts["his"] += 1 + + # _Frames.xml → KIM-KV/{img_dir}/_Frames.xml (raw copy) + frames_xml = session.img_dir / "_Frames.xml" + if frames_xml.exists(): + shutil.copy2(frames_xml, dest / "_Frames.xml") + counts["frames_xml"] = 1 + + return counts + + def copy_anonymised_plans( + self, + anon_ct_dir: Path = None, + anon_plan_dir: Path = None, + anon_struct_dir: Path = None, + anon_dose_dir: Path = None, + ) -> dict: + """Copy anonymised DICOM files to the LEARN structure. + + Returns ``{"ct_count": N, "plan_count": M, "structures_count": P, "dose_count": Q}``. + """ + site_root = self.output_base / self.site_name + plans_root = site_root / "Patient Plans" / self.anon_id + counts = { + "ct_count": 0, + "plan_count": 0, + "structures_count": 0, + "dose_count": 0, + } + + mapping = [ + (anon_ct_dir, "CT", "ct_count"), + (anon_plan_dir, "Plan", "plan_count"), + (anon_struct_dir, "Structure Set", "structures_count"), + (anon_dose_dir, "Dose", "dose_count"), + ] + + for src_dir, dest_name, count_key in mapping: + if src_dir is None: + continue + src_dir = Path(src_dir) + if not src_dir.is_dir(): + continue + dest = plans_root / dest_name + dest.mkdir(parents=True, exist_ok=True) + for f in sorted(src_dir.rglob("*")): + if f.is_file(): + shutil.copy2(f, dest / f.name) + counts[count_key] += 1 + + return counts + + # ----- Centroid file ----- + + def copy_centroid_file(self, centroid_path: Path) -> Path: + """Copy a centroid file to Patient Files/{anon_id}/ (raw, no anonymisation). + + Anonymisation is handled separately by the anonymise module. + + Returns the output file path. 
+ """ + centroid_path = Path(centroid_path) + + site_root = self.output_base / self.site_name + dest_dir = site_root / "Patient Files" / self.anon_id + dest_dir.mkdir(parents=True, exist_ok=True) + output_path = dest_dir / centroid_path.name + + shutil.copy2(centroid_path, output_path) + logger.info("Copied centroid -> %s", output_path.name) + return output_path + + # ----- Trajectory logs ----- + + def copy_trajectory_logs(self, trajectory_base_dir: Path) -> dict: + """Copy KIM trajectory log files to the LEARN Trajectory Logs structure (raw). + + Auto-discovers FX## directories under *trajectory_base_dir* and copies + all files as-is. Anonymisation is handled separately by the anonymise + module. + + Returns ``{"fx_count": N, "files_copied": M}``. + """ + trajectory_base_dir = Path(trajectory_base_dir) + site_root = self.output_base / self.site_name + + # Auto-discover FX## directories + fx_dirs = sorted( + d for d in trajectory_base_dir.iterdir() + if d.is_dir() and re.match(r"FX\d+", d.name, re.IGNORECASE) + ) + + counts = {"fx_count": 0, "files_copied": 0} + + for fx_dir in fx_dirs: + fx_label = fx_dir.name # e.g. "FX01" + dest_traj = ( + site_root / "Trajectory Logs" / self.anon_id + / fx_label / "Trajectory Logs" + ) + dest_traj.mkdir(parents=True, exist_ok=True) + # Also create Treatment Records sibling + dest_treat = ( + site_root / "Trajectory Logs" / self.anon_id + / fx_label / "Treatment Records" + ) + dest_treat.mkdir(parents=True, exist_ok=True) + + counts["fx_count"] += 1 + + for f in sorted(fx_dir.iterdir()): + if not f.is_file(): + continue + shutil.copy2(f, dest_traj / f.name) + counts["files_copied"] += 1 + + logger.info("Trajectory logs copied: %s", counts) + return counts + + # ----- Execute ----- + + def execute( + self, + centroid_path: Path = None, + trajectory_base_dir: Path = None, + dry_run: bool = False, + progress_callback=None, + ) -> dict: + """Run the full folder mapping pipeline (raw copy, no anonymisation). 
+ + Parameters + ---------- + centroid_path : Path, optional + Path to a centroid file to copy. + trajectory_base_dir : Path, optional + Base directory containing FX## trajectory log folders. + dry_run : bool + If True, create directories but skip file copies. + + Returns + ------- + dict + Summary with keys: sessions, fractions, files_copied, dry_run. + """ + # 1. Discover + sessions = self.discover_sessions() + logger.info("Discovered %d sessions", len(sessions)) + + # 2. Match MotionView dates + dated = [s for s in sessions if s.scan_datetime is not None] + undated = [s for s in sessions if s.scan_datetime is None] + if undated: + self._match_motionview_dates(dated, undated) + + # 3. Assign fractions + fraction_map = self.assign_fractions(sessions) + logger.info("Assigned %d fractions", len(fraction_map)) + + # 4. Discover trajectory FX labels for directory creation + trajectory_fx_labels = None + if trajectory_base_dir: + trajectory_base_dir = Path(trajectory_base_dir) + if trajectory_base_dir.is_dir(): + trajectory_fx_labels = sorted( + d.name for d in trajectory_base_dir.iterdir() + if d.is_dir() and re.match(r"FX\d+", d.name, re.IGNORECASE) + ) + + # 5. Create directory structure + site_root = self.create_learn_structure(fraction_map, trajectory_fx_labels) + + summary = { + "sessions": len(sessions), + "fractions": len(fraction_map), + "files_copied": { + "his": 0, "scan": 0, "rps": 0, "ini": 0, + "motionview": 0, "frames_xml": 0, + }, + "dry_run": dry_run, + } + + if dry_run: + logger.info("Dry run — directories created, no files copied") + return summary + + def _progress(current: int, total: int, msg: str) -> None: + if progress_callback: + progress_callback(current, total, msg) + + # 6. 
Copy files + total_sessions = len(sessions) + completed_sessions = 0 + images_root = site_root / "Patient Images" / self.anon_id + for fx_label, sessions_in_fx in fraction_map.items(): + fx_path = images_root / fx_label + + cbct_sessions = [ + s for s in sessions_in_fx if s.session_type in ("cbct", "kim_learning") + ] + cbct_sessions.sort(key=lambda s: s.scan_datetime or datetime.min) + + for cbct_idx, session in enumerate(cbct_sessions, start=1): + cbct_path = fx_path / "CBCT" / f"CBCT{cbct_idx}" + _progress(completed_sessions, total_sessions, f"{fx_label}/CBCT{cbct_idx}") + counts = self.copy_cbct_files(session, cbct_path) + summary["files_copied"]["his"] += counts["his"] + summary["files_copied"]["scan"] += counts["scan"] + summary["files_copied"]["rps"] += counts["rps"] + summary["files_copied"]["ini"] += counts["ini"] + summary["files_copied"]["frames_xml"] += counts["frames_xml"] + completed_sessions += 1 + + mv_sessions = [ + s for s in sessions_in_fx if s.session_type == "motionview" + ] + for session in mv_sessions: + _progress(completed_sessions, total_sessions, f"{fx_label}/KIM-KV") + mv_counts = self.copy_motionview_files(session, fx_path) + summary["files_copied"]["motionview"] += mv_counts["his"] + summary["files_copied"]["frames_xml"] += mv_counts["frames_xml"] + completed_sessions += 1 + + _progress(total_sessions, total_sessions, "File copy complete") + + # 7. Copy centroid file + if centroid_path: + centroid_path = Path(centroid_path) + if centroid_path.exists(): + self.copy_centroid_file(centroid_path) + summary["files_copied"]["centroid"] = 1 + + # 8. 
Copy trajectory logs + if trajectory_base_dir and Path(trajectory_base_dir).is_dir(): + traj_counts = self.copy_trajectory_logs(trajectory_base_dir) + summary["files_copied"]["trajectory"] = traj_counts["files_copied"] + + logger.info("Execute complete: %s", summary) + return summary diff --git a/learn_upload/gui_qt.py b/learn_upload/gui_qt.py new file mode 100644 index 0000000..a37fea1 --- /dev/null +++ b/learn_upload/gui_qt.py @@ -0,0 +1,1587 @@ +"""PyQt6 desktop GUI for the LEARN data transfer pipeline. + +Provides a 6-step wizard wrapping the existing learn_upload modules: + 1. Configuration -- paths, anon ID, PII strings + 2. Data Preview -- session discovery and fraction assignment + 3. Anonymise -- DICOM anonymisation with per-file progress + 4. Folder Sort -- copy files into LEARN directory structure + 5. PII Verification -- scan output for residual patient data + 6. CBCT Shift Report -- generate markdown report of CBCT shifts + +Usage: + python -m learn_upload.gui_qt +""" + +import ctypes +import faulthandler +import json +import logging +import sys +from dataclasses import asdict +from datetime import datetime +from pathlib import Path +from typing import Optional + +from PyQt6.QtCore import ( + QObject, + Qt, + QThread, + pyqtSignal, +) +from PyQt6.QtGui import QFont +from PyQt6.QtWidgets import ( + QApplication, + QCheckBox, + QFileDialog, + QFormLayout, + QFrame, + QHBoxLayout, + QHeaderView, + QLabel, + QLineEdit, + QMainWindow, + QMessageBox, + QProgressBar, + QPushButton, + QScrollArea, + QStackedWidget, + QTableWidget, + QTableWidgetItem, + QTextEdit, + QVBoxLayout, + QWidget, +) + +from learn_upload.anonymise_dicom import anonymise_output_folder +from learn_upload.config import DEFAULT_LEARN_OUTPUT, setup_logging +from learn_upload.folder_sort import LearnFolderMapper +from learn_upload.verify_pii import verify_no_pii + +logger = logging.getLogger(__name__) + +CONFIG_FILE = Path.home() / ".learn_pipeline_config.json" + +# 
--------------------------------------------------------------------------- +# Dark theme stylesheet +# --------------------------------------------------------------------------- + +DARK_QSS = """ +QMainWindow, QWidget { + background-color: #0f1117; + color: #e2e8f0; + font-family: 'Segoe UI', system-ui, sans-serif; + font-size: 13px; +} +QLabel { + color: #e2e8f0; + background: transparent; +} +QLabel[class="heading"] { + font-size: 18px; + font-weight: bold; +} +QLabel[class="section"] { + font-size: 13px; + font-weight: bold; + color: #94a3b8; + text-transform: uppercase; +} +QLabel[class="muted"] { + color: #94a3b8; + font-size: 12px; +} +QLineEdit { + background-color: #1a2030; + border: 1px solid #2a3040; + border-radius: 5px; + color: #e2e8f0; + padding: 7px 10px; + font-family: 'Cascadia Code', 'Consolas', monospace; + font-size: 12px; + selection-background-color: #6c63ff; +} +QLineEdit:focus { + border-color: #6c63ff; +} +QLineEdit:disabled { + color: #5c6578; + background-color: #151820; +} +QPushButton { + background-color: #1a1d27; + border: 1px solid #2a3040; + border-radius: 5px; + color: #94a3b8; + padding: 8px 16px; + font-weight: bold; + font-size: 13px; +} +QPushButton:hover { + border-color: #6c63ff; + color: #e2e8f0; +} +QPushButton:pressed { + background-color: #252838; +} +QPushButton:disabled { + color: #3a3f50; + border-color: #1e2230; +} +QPushButton[class="primary"] { + background-color: #6c63ff; + color: #ffffff; + border: none; +} +QPushButton[class="primary"]:hover { + background-color: #5b53e0; +} +QPushButton[class="primary"]:disabled { + background-color: #3a3660; + color: #6c6880; +} +QCheckBox { + color: #e2e8f0; + spacing: 8px; +} +QCheckBox::indicator { + width: 16px; + height: 16px; + border: 1px solid #2a3040; + border-radius: 3px; + background: #1a2030; +} +QCheckBox::indicator:checked { + background-color: #6c63ff; + border-color: #6c63ff; +} +QProgressBar { + background-color: #1a2030; + border: none; + border-radius: 3px; 
+ height: 8px; + text-align: center; + color: transparent; +} +QProgressBar::chunk { + background-color: #6c63ff; + border-radius: 3px; +} +QTableWidget { + background-color: #1a1d27; + border: 1px solid #2a3040; + border-radius: 5px; + gridline-color: #2a3040; + color: #e2e8f0; + font-family: 'Cascadia Code', 'Consolas', monospace; + font-size: 12px; + selection-background-color: rgba(108, 99, 255, 0.2); +} +QTableWidget::item { + padding: 6px 10px; +} +QHeaderView::section { + background-color: #11151c; + color: #94a3b8; + border: none; + border-bottom: 1px solid #2a3040; + border-right: 1px solid #2a3040; + padding: 8px 10px; + font-weight: bold; + font-size: 11px; + text-transform: uppercase; +} +QTextEdit { + background-color: #050810; + border: 1px solid #2a3040; + border-radius: 5px; + color: #e2e8f0; + font-family: 'Cascadia Code', 'Consolas', monospace; + font-size: 12px; + padding: 8px; + selection-background-color: rgba(108, 99, 255, 0.3); +} +QScrollBar:vertical { + background: transparent; + width: 8px; + margin: 0; +} +QScrollBar::handle:vertical { + background: #2a3040; + border-radius: 4px; + min-height: 30px; +} +QScrollBar::handle:vertical:hover { + background: #3a4050; +} +QScrollBar::add-line:vertical, QScrollBar::sub-line:vertical, +QScrollBar::add-page:vertical, QScrollBar::sub-page:vertical { + background: none; + height: 0; +} +QScrollBar:horizontal { + background: transparent; + height: 8px; +} +QScrollBar::handle:horizontal { + background: #2a3040; + border-radius: 4px; + min-width: 30px; +} +QFrame[class="sidebar"] { + background-color: #11151c; + border-right: 1px solid #2a3040; +} +QFrame[class="card"] { + background-color: #1a1d27; + border: 1px solid #2a3040; + border-radius: 6px; +} +QFrame[class="separator"] { + background-color: #2a3040; + max-height: 1px; +} +""" + + +# --------------------------------------------------------------------------- +# QThread workers +# 
# ---------------------------------------------------------------------------

class DiscoveryWorker(QThread):
    """Background session discovery.

    Emits ``finished`` with ``{"ok": True, "session_count": N, "sessions": [...]}``
    where each session dict has its Path fields stringified and datetime fields
    converted to ISO-8601 so the payload is JSON-serialisable.
    """

    progress = pyqtSignal(str)
    finished = pyqtSignal(dict)
    error = pyqtSignal(str)

    def __init__(self, mapper: LearnFolderMapper):
        super().__init__()
        self.mapper = mapper

    def run(self):
        try:
            self.progress.emit("Discovering sessions...")
            sessions = self.mapper.discover_sessions(enrich=True)

            # Convert dataclass fields to JSON-friendly primitives before
            # crossing the signal boundary.
            serialized = []
            for s in sessions:
                d = asdict(s)
                for key, val in d.items():
                    if isinstance(val, Path):
                        d[key] = str(val)
                    elif isinstance(val, datetime):
                        d[key] = val.isoformat()
                serialized.append(d)

            self.finished.emit({
                "ok": True,
                "session_count": len(sessions),
                "sessions": serialized,
            })
        except Exception as exc:
            logger.exception("Discovery failed")
            self.error.emit(str(exc))


class AnonymiseWorker(QThread):
    """Runs DICOM anonymisation off the GUI thread.

    Emits ``progress(current, total, filename)`` per file and ``finished``
    with the result dict from :func:`anonymise_output_folder`.
    """

    progress = pyqtSignal(int, int, str)  # current, total, filename
    finished = pyqtSignal(dict)
    error = pyqtSignal(str)

    def __init__(self, config: dict):
        super().__init__()
        self.config = config

    def run(self):
        cfg = self.config
        try:
            output_dir = Path(cfg["output_path"])
            site_name = cfg["site_name"].strip()
            anon_id = cfg["anon_id"].strip()
            patient_dir = Path(cfg["source_path"])
            tps_path = cfg.get("tps_path", "").strip()

            result = anonymise_output_folder(
                output_dir=output_dir,
                anon_id=anon_id,
                site_name=site_name,
                patient_dir=patient_dir,
                tps_path=Path(tps_path) if tps_path else None,
                progress_callback=lambda cur, tot, name: self.progress.emit(cur, tot, name),
            )
            self.finished.emit(result)
        except Exception as exc:
            logger.exception("Anonymisation failed")
            self.error.emit(str(exc))


class FolderSortWorker(QThread):
    """Runs the folder-mapping pipeline (``LearnFolderMapper.execute``) off-thread.

    Emits ``progress(current, total, description)`` and ``finished`` with the
    execute summary dict.
    """

    progress = pyqtSignal(int, int, str)  # current, total, description
    finished = pyqtSignal(dict)
    error = pyqtSignal(str)

    def __init__(self, mapper: LearnFolderMapper, config: dict):
        super().__init__()
        self.mapper = mapper
        self.config = config

    def run(self):
        cfg = self.config
        try:
            centroid_path = cfg.get("centroid_path", "")
            trajectory_dir = cfg.get("trajectory_dir", "")
            dry_run = cfg.get("dry_run", False)

            summary = self.mapper.execute(
                centroid_path=Path(centroid_path) if centroid_path else None,
                trajectory_base_dir=Path(trajectory_dir) if trajectory_dir else None,
                dry_run=dry_run,
                progress_callback=lambda cur, tot, msg: self.progress.emit(cur, tot, msg),
            )
            self.finished.emit(summary)
        except Exception as exc:
            logger.exception("Folder sort failed")
            self.error.emit(str(exc))


class PiiCheckWorker(QThread):
    """Scans the anonymised output tree for residual PII strings.

    Emits ``finished`` with ``{"passed": bool, "files_scanned": N,
    "findings": [...]}`` (plus ``"skipped": True`` when no search strings
    were configured).
    """

    finished = pyqtSignal(dict)
    error = pyqtSignal(str)

    def __init__(self, config: dict):
        super().__init__()
        self.config = config

    def run(self):
        cfg = self.config
        try:
            pii_strings = [s.strip() for s in cfg.get("pii_strings", []) if s.strip()]

            # Auto-detect original MRN from source directory name
            # (directories named "patient_<mrn>").
            source_path = cfg.get("source_path", "")
            if source_path:
                dir_name = Path(source_path).name
                if dir_name.lower().startswith("patient_"):
                    mrn = dir_name[len("patient_"):]
                    if mrn and mrn not in pii_strings:
                        pii_strings.append(mrn)
                        # Deliberately does not log the MRN value itself.
                        logger.info("Auto-detected MRN from source path")

            if not pii_strings:
                self.finished.emit({
                    "passed": True,
                    "skipped": True,
                    "files_scanned": 0,
                    "findings": [],
                })
                return

            output_base = Path(cfg["output_path"])
            site_name = cfg["site_name"].strip()
            scan_dir = output_base / site_name

            # Fall back to the output root when the site folder is absent.
            if not scan_dir.is_dir():
                scan_dir = output_base

            findings = verify_no_pii(scan_dir, pii_strings)

            serialized = []
            for f in findings:
                serialized.append({
                    "file": str(f["file"]),
                    "location": f["location"],
                    "matched": f["matched"],
                })

            # BUG FIX: previously len(list(scan_dir.rglob("*"))) counted
            # directories as well, inflating the "files scanned" statistic.
            files_scanned = sum(1 for p in scan_dir.rglob("*") if p.is_file())

            self.finished.emit({
                "passed": len(findings) == 0,
                "files_scanned": files_scanned,
                "findings": serialized,
            })
        except Exception as exc:
            logger.exception("PII verification failed")
            self.error.emit(str(exc))
class ReportWorker(QThread):
    """Generates the CBCT shift markdown report on a background thread.

    Emits ``finished`` with the report text, or ``error`` with a message
    when no RPS files are available or generation raises.
    """

    finished = pyqtSignal(str)
    error = pyqtSignal(str)

    def __init__(self, patient_images_path: str):
        super().__init__()
        self.patient_images_path = patient_images_path

    def run(self):
        try:
            # report_patient_details lives in the sibling cbct-shifts directory,
            # which is not an installable package — extend sys.path to import it.
            cbct_shifts_dir = Path(__file__).resolve().parent.parent / "cbct-shifts"
            if str(cbct_shifts_dir) not in sys.path:
                sys.path.insert(0, str(cbct_shifts_dir))
            from report_patient_details import generate_report

            report = generate_report(Path(self.patient_images_path))
            if report is None:
                self.error.emit("No RPS files found -- cannot generate report.")
            else:
                self.finished.emit(report)
        except Exception as exc:
            logger.exception("Report generation failed")
            self.error.emit(str(exc))


# ---------------------------------------------------------------------------
# Logging handler that forwards to a QTextEdit widget
# ---------------------------------------------------------------------------

class _LogSignalBridge(QObject):
    """Thread-safe bridge: emits a signal so QTextEdit.append runs on the main thread."""
    log_message = pyqtSignal(str)


class QtLogHandler(logging.Handler):
    """Routes log records to a QTextEdit terminal widget (thread-safe via signal)."""

    def __init__(self, text_edit: QTextEdit):
        super().__init__()
        self._bridge = _LogSignalBridge()
        self._bridge.log_message.connect(text_edit.append)

    def emit(self, record: logging.LogRecord) -> None:
        try:
            msg = self.format(record)
            level = record.levelname.lower()
            if level == "error":
                color = "#ef4444"
            elif level == "warning":
                color = "#f59e0b"
            else:
                color = "#94a3b8"
            # BUG FIX: `color` was computed but never applied — the message is
            # now wrapped in a styled span so QTextEdit renders it in colour.
            html = f'<span style="color:{color}">{_esc(msg)}</span>'
            self._bridge.log_message.emit(html)
        except Exception:
            # A logging handler must never raise into application code.
            pass


def _esc(text: str) -> str:
    """Escape ``&``, ``<`` and ``>`` so log text renders literally in rich text.

    BUG FIX: the previous replacements mapped each character to itself,
    making the function a no-op and letting log content be parsed as HTML.
    ``&`` must be escaped first so entity ampersands are not double-escaped.
    """
    return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")


# ---------------------------------------------------------------------------
# Sidebar step indicator
# ---------------------------------------------------------------------------

class StepIndicator(QWidget):
    """A single step label in the left sidebar. Clickable when completed."""

    clicked = pyqtSignal(int)  # emits the 0-indexed step when clicked

    def __init__(self, number: int, label: str, parent=None):
        super().__init__(parent)
        self.number = number
        self.label_text = label
        self._state = "future"  # one of "future" | "active" | "completed"

        row = QHBoxLayout(self)
        row.setContentsMargins(20, 10, 20, 10)
        row.setSpacing(12)

        # Numbered circle badge on the left.
        self.circle = QLabel(str(number))
        self.circle.setFixedSize(28, 28)
        self.circle.setAlignment(Qt.AlignmentFlag.AlignCenter)
        self.circle.setFont(QFont("Segoe UI", 10, QFont.Weight.Bold))
        row.addWidget(self.circle)

        self.label = QLabel(label)
        self.label.setFont(QFont("Segoe UI", 12))
        row.addWidget(self.label)
        row.addStretch()

        self.set_state("future")

    def mousePressEvent(self, event):
        # Only completed steps are navigable.
        if self._state == "completed":
            self.clicked.emit(self.number - 1)  # 0-indexed step
        super().mousePressEvent(event)

    def set_state(self, state: str):
        """Restyle the indicator for "future", "active" or "completed"."""
        self._state = state
        if state == "active":
            self.setCursor(Qt.CursorShape.ArrowCursor)
            self.setStyleSheet(
                "background: rgba(108, 99, 255, 0.08); "
                "border-left: 3px solid #6c63ff;"
            )
            self.circle.setStyleSheet(
                "background-color: #6c63ff; color: #ffffff; "
                "border-radius: 14px; font-weight: bold;"
            )
            self.label.setStyleSheet("color: #e2e8f0; font-weight: bold;")
        elif state == "completed":
            self.setStyleSheet("background: transparent; border-left: 3px solid transparent;")
            self.setCursor(Qt.CursorShape.PointingHandCursor)
            self.circle.setStyleSheet(
                "background-color: #22c55e; color: #ffffff; "
                "border-radius: 14px; font-weight: bold;"
            )
            self.circle.setText("\u2713")  # check mark replaces the number
            self.label.setStyleSheet("color: #94a3b8;")
        else:
            self.setCursor(Qt.CursorShape.ArrowCursor)
            self.setStyleSheet("background: transparent; border-left: 3px solid transparent;")
            self.circle.setStyleSheet(
                "background-color: transparent; color: #5c6578; "
                "border: 2px solid #5c6578; border-radius: 14px;"
            )
            self.circle.setText(str(self.number))
            self.label.setStyleSheet("color: #5c6578;")


# ---------------------------------------------------------------------------
# Step pages
# ---------------------------------------------------------------------------

class ConfigPage(QWidget):
    """Step 1: Configuration form."""

    def __init__(self, parent=None):
        super().__init__(parent)
        page_layout = QVBoxLayout(self)
        page_layout.setContentsMargins(0, 0, 0, 0)

        # Scrollable page: cards live in a centred, width-capped column.
        scroll = QScrollArea()
        scroll.setWidgetResizable(True)
        scroll.setFrameShape(QFrame.Shape.NoFrame)
        page_layout.addWidget(scroll)

        scroll_content = QWidget()
        outer_layout = QVBoxLayout(scroll_content)
        outer_layout.setContentsMargins(24, 24, 24, 24)
        outer_layout.setAlignment(Qt.AlignmentFlag.AlignHCenter)
        scroll.setWidget(scroll_content)

        inner = QWidget()
        inner.setMaximumWidth(800)
        layout = QVBoxLayout(inner)
        layout.setContentsMargins(0, 0, 0, 0)
        layout.setSpacing(16)

        outer_layout.addWidget(inner)

        # -- Patient Identity card --
        id_card = self._make_card("Patient Identity")
        id_form = QFormLayout()
        id_form.setSpacing(10)

        self.anon_id = QLineEdit("PAT01")
        self.anon_id.setPlaceholderText("PAT01")
        id_form.addRow("Anonymised ID:", self.anon_id)

        self.site_name = QLineEdit()
        self.site_name.setPlaceholderText("e.g. Prostate")
        id_form.addRow("Site Name:", self.site_name)

        id_card.layout().addLayout(id_form)
        layout.addWidget(id_card)

        # -- Data Paths card --
        paths_card = self._make_card("Data Paths")
        paths_form = QFormLayout()
        paths_form.setSpacing(10)

        self.source_path = self._path_row(paths_form, "Source Path (XVI patient root):", folder=True)
        self.tps_path = self._path_row(paths_form, "TPS Export Path:", folder=True)
        self.output_path = self._path_row(paths_form, "Output Path:", folder=True)
        self.output_path.setText(str(DEFAULT_LEARN_OUTPUT))
        self.staging_path = self._path_row(paths_form, "Staging Path:", folder=True)
        self.staging_path.setPlaceholderText("(auto: output/_staging)")
        self.images_subdir = QLineEdit("XVI Export")
        self.images_subdir.setPlaceholderText("IMAGES")
        paths_form.addRow("Images Subdirectory:", self.images_subdir)
        self.centroid_path = self._path_row(paths_form, "Centroid File:", folder=False)
        self.trajectory_dir = self._path_row(paths_form, "Trajectory Logs Dir:", folder=True)

        paths_card.layout().addLayout(paths_form)
        layout.addWidget(paths_card)

        # -- PII Strings card --
        pii_card = self._make_card("PII Search Strings")
        pii_hint = QLabel("Comma-separated patient identifiers to scan for after anonymisation.")
        pii_hint.setProperty("class", "muted")
        pii_hint.setStyleSheet("color: #94a3b8; font-size: 12px; background: transparent;")
        pii_card.layout().addWidget(pii_hint)
        self.pii_strings = QLineEdit()
        self.pii_strings.setPlaceholderText("e.g. Smith, John, 12345678")
        pii_card.layout().addWidget(self.pii_strings)
        layout.addWidget(pii_card)

        # -- Options card --
        opt_card = self._make_card("Options")
        self.dry_run = QCheckBox("Dry run (preview only, no files copied)")
        opt_card.layout().addWidget(self.dry_run)
        layout.addWidget(opt_card)

        layout.addStretch()

    def _make_card(self, title: str) -> QFrame:
        """Build a titled card frame; callers add widgets to its layout."""
        card = QFrame()
        card.setProperty("class", "card")
        card.setStyleSheet(
            "QFrame[class='card'] { background-color: #1a1d27; "
            "border: 1px solid #2a3040; border-radius: 6px; padding: 16px; }"
        )
        body = QVBoxLayout(card)
        body.setContentsMargins(16, 16, 16, 16)
        body.setSpacing(12)
        heading = QLabel(title)
        heading.setStyleSheet(
            "font-size: 12px; font-weight: bold; color: #94a3b8; "
            "text-transform: uppercase; letter-spacing: 0.5px; background: transparent;"
        )
        body.addWidget(heading)
        return card

    def _path_row(self, form: QFormLayout, label: str, folder: bool) -> QLineEdit:
        """Add a line-edit + Browse button row to *form*; returns the edit."""
        row = QWidget()
        row_layout = QHBoxLayout(row)
        row_layout.setContentsMargins(0, 0, 0, 0)
        row_layout.setSpacing(6)
        line_edit = QLineEdit()
        # Optional fields get an "(optional)" placeholder hint.
        is_optional = (
            "optional" in label.lower()
            or "TPS" in label
            or "Centroid" in label
            or "Trajectory" in label
        )
        line_edit.setPlaceholderText("(optional)" if is_optional else "")
        row_layout.addWidget(line_edit)
        btn = QPushButton("Browse")
        btn.setFixedWidth(80)
        if folder:
            btn.clicked.connect(lambda: self._browse_folder(line_edit))
        else:
            btn.clicked.connect(lambda: self._browse_file(line_edit))
        row_layout.addWidget(btn)
        form.addRow(label, row)
        return line_edit

    def _browse_folder(self, target: QLineEdit):
        chosen = QFileDialog.getExistingDirectory(self, "Select Folder", target.text())
        if chosen:
            target.setText(chosen)

    def _browse_file(self, target: QLineEdit):
        chosen, _ = QFileDialog.getOpenFileName(self, "Select File", target.text())
        if chosen:
            target.setText(chosen)

    def get_config(self) -> dict:
        """Collect all form fields into a plain dict (whitespace stripped)."""
        pii = [s.strip() for s in self.pii_strings.text().split(",") if s.strip()]
        return {
            "anon_id": self.anon_id.text().strip(),
            "site_name": self.site_name.text().strip(),
            "source_path": self.source_path.text().strip(),
            "tps_path": self.tps_path.text().strip(),
            "output_path": self.output_path.text().strip(),
            "staging_path": self.staging_path.text().strip(),
            "images_subdir": self.images_subdir.text().strip() or "IMAGES",
            "centroid_path": self.centroid_path.text().strip(),
            "trajectory_dir": self.trajectory_dir.text().strip(),
            "pii_strings": pii,
            "dry_run": self.dry_run.isChecked(),
        }

    def validate(self) -> Optional[str]:
        """Return an error message for the first invalid field, else None."""
        cfg = self.get_config()
        if not cfg["anon_id"]:
            return "Anonymised ID is required."
        if not cfg["anon_id"].startswith("PAT") or not cfg["anon_id"][3:].isdigit():
            return "Anon ID must match PATxx format (e.g. PAT01)."
        if not cfg["site_name"]:
            return "Site Name is required."
        if not cfg["source_path"]:
            return "Source Path is required."
        if not cfg["output_path"]:
            return "Output Path is required."
+ return None + + def set_config(self, cfg: dict) -> None: + """Pre-populate form fields from a config dict.""" + self.anon_id.setText(cfg.get("anon_id", "")) + self.site_name.setText(cfg.get("site_name", "")) + self.source_path.setText(cfg.get("source_path", "")) + self.tps_path.setText(cfg.get("tps_path", "")) + self.output_path.setText(cfg.get("output_path", "")) + self.staging_path.setText(cfg.get("staging_path", "")) + self.images_subdir.setText(cfg.get("images_subdir", "XVI Export")) + self.centroid_path.setText(cfg.get("centroid_path", "")) + self.trajectory_dir.setText(cfg.get("trajectory_dir", "")) + pii = cfg.get("pii_strings", []) + if isinstance(pii, list): + self.pii_strings.setText(", ".join(pii)) + else: + self.pii_strings.setText(str(pii)) + self.dry_run.setChecked(bool(cfg.get("dry_run", False))) + + +class PreviewPage(QWidget): + """Step 2: Data Preview -- shows discovered sessions in a table.""" + + def __init__(self, parent=None): + super().__init__(parent) + layout = QVBoxLayout(self) + layout.setContentsMargins(24, 24, 24, 24) + layout.setSpacing(16) + + # Stats row + self.stats_layout = QHBoxLayout() + self.stats_layout.setSpacing(12) + layout.addLayout(self.stats_layout) + + # Table + self.table = QTableWidget() + self.table.setColumnCount(7) + self.table.setHorizontalHeaderLabels([ + "Type", "Directory", "Datetime", + "Treatment", "kV", "mA", "RPS", + ]) + self.table.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.Stretch) + self.table.setEditTriggers(QTableWidget.EditTrigger.NoEditTriggers) + self.table.setSelectionBehavior(QTableWidget.SelectionBehavior.SelectRows) + self.table.setAlternatingRowColors(False) + self.table.verticalHeader().setVisible(False) + layout.addWidget(self.table) + + # Status label (shown during loading) + self.status = QLabel("Discovering sessions...") + self.status.setAlignment(Qt.AlignmentFlag.AlignCenter) + self.status.setStyleSheet("color: #94a3b8; font-size: 14px; padding: 40px;") + 
layout.addWidget(self.status) + + def set_loading(self, loading: bool): + self.table.setVisible(not loading) + self.status.setVisible(loading) + if loading: + self.status.setText("Discovering sessions...") + + def populate(self, data: dict): + self.set_loading(False) + + # Clear old stat cards + while self.stats_layout.count(): + item = self.stats_layout.takeAt(0) + if item.widget(): + item.widget().deleteLater() + + self.stats_layout.addWidget(self._stat_card(str(data["session_count"]), "Sessions")) + + # Fill table + sessions = data.get("sessions", []) + self.table.setRowCount(len(sessions)) + for row, s in enumerate(sessions): + self.table.setItem(row, 0, QTableWidgetItem(s.get("session_type", ""))) + img_dir = s.get("img_dir", "") + dir_name = Path(img_dir).name if img_dir else "" + self.table.setItem(row, 1, QTableWidgetItem(dir_name)) + dt = s.get("scan_datetime") + dt_str = str(dt).replace("T", " ")[:19] if dt else "-" + self.table.setItem(row, 2, QTableWidgetItem(dt_str)) + self.table.setItem(row, 3, QTableWidgetItem(s.get("treatment_id") or "-")) + kv = s.get("tube_kv") + self.table.setItem(row, 4, QTableWidgetItem(str(kv) if kv is not None else "-")) + ma = s.get("tube_ma") + self.table.setItem(row, 5, QTableWidgetItem(str(ma) if ma is not None else "-")) + self.table.setItem(row, 6, QTableWidgetItem("Yes" if s.get("has_rps") else "-")) + + def _stat_card(self, value: str, label: str, color: str = "#6c63ff") -> QFrame: + card = QFrame() + card.setStyleSheet( + "background-color: #1a1d27; border: 1px solid #2a3040; " + "border-radius: 6px; padding: 12px;" + ) + card_layout = QVBoxLayout(card) + card_layout.setContentsMargins(12, 12, 12, 12) + card_layout.setSpacing(4) + val_label = QLabel(value) + val_label.setStyleSheet( + f"font-size: 24px; font-weight: bold; color: {color}; " + f"font-family: 'Cascadia Code', 'Consolas', monospace; background: transparent;" + ) + card_layout.addWidget(val_label) + name_label = QLabel(label) + name_label.setStyleSheet( + 
"font-size: 11px; color: #94a3b8; text-transform: uppercase; " + "letter-spacing: 0.5px; background: transparent;" + ) + card_layout.addWidget(name_label) + return card + + +class ProgressPage(QWidget): + """Reusable step page with progress bar, status label, and log terminal.""" + + def __init__(self, title: str = "", parent=None): + super().__init__(parent) + layout = QVBoxLayout(self) + layout.setContentsMargins(24, 24, 24, 24) + layout.setSpacing(16) + + # Error banner (hidden by default) + self.error_banner = QLabel() + self.error_banner.setStyleSheet( + "background: rgba(239, 68, 68, 0.1); border: 1px solid #ef4444; " + "border-radius: 6px; padding: 12px; color: #ef4444; " + "font-family: 'Cascadia Code', monospace; font-size: 13px;" + ) + self.error_banner.setWordWrap(True) + self.error_banner.hide() + layout.addWidget(self.error_banner) + + # Stats row + self.stats_layout = QHBoxLayout() + self.stats_layout.setSpacing(12) + layout.addLayout(self.stats_layout) + + # Progress section + self.progress_label = QLabel("Preparing...") + self.progress_label.setStyleSheet("color: #94a3b8; font-size: 12px; background: transparent;") + layout.addWidget(self.progress_label) + + self.progress_bar = QProgressBar() + self.progress_bar.setRange(0, 100) + self.progress_bar.setValue(0) + layout.addWidget(self.progress_bar) + + # Log terminal + terminal_heading = QLabel("Log Output") + terminal_heading.setStyleSheet( + "font-size: 12px; font-weight: bold; color: #94a3b8; " + "text-transform: uppercase; letter-spacing: 0.5px; background: transparent;" + ) + layout.addWidget(terminal_heading) + + self.terminal = QTextEdit() + self.terminal.setReadOnly(True) + self.terminal.setMinimumHeight(180) + layout.addWidget(self.terminal) + + layout.addStretch() + + def reset(self): + self.error_banner.hide() + self.progress_label.setText("Preparing...") + self.progress_bar.setRange(0, 100) + self.progress_bar.setValue(0) + self.terminal.clear() + self._clear_stats() + + def 
set_indeterminate(self, text: str = "Running..."): + self.progress_bar.setRange(0, 0) # indeterminate mode + self.progress_label.setText(text) + + def set_progress(self, current: int, total: int, text: str = ""): + self.progress_bar.setRange(0, total) + self.progress_bar.setValue(current) + pct = round(current / total * 100) if total > 0 else 0 + self.progress_label.setText(f"{text} ({pct}%)" if text else f"{pct}%") + + def set_complete(self, text: str = "Complete"): + self.progress_bar.setRange(0, 100) + self.progress_bar.setValue(100) + self.progress_label.setText(text) + + def show_error(self, message: str): + self.error_banner.setText(message) + self.error_banner.show() + + def add_stat(self, value: str, label: str, color: str = "#6c63ff"): + card = QFrame() + card.setStyleSheet( + "background-color: #1a1d27; border: 1px solid #2a3040; " + "border-radius: 6px; padding: 12px;" + ) + cl = QVBoxLayout(card) + cl.setContentsMargins(12, 12, 12, 12) + cl.setSpacing(4) + vl = QLabel(value) + vl.setStyleSheet( + f"font-size: 24px; font-weight: bold; color: {color}; " + f"font-family: 'Cascadia Code', monospace; background: transparent;" + ) + cl.addWidget(vl) + nl = QLabel(label) + nl.setStyleSheet( + "font-size: 11px; color: #94a3b8; text-transform: uppercase; " + "letter-spacing: 0.5px; background: transparent;" + ) + cl.addWidget(nl) + self.stats_layout.addWidget(card) + + def _clear_stats(self): + while self.stats_layout.count(): + item = self.stats_layout.takeAt(0) + if item.widget(): + item.widget().deleteLater() + + +class PiiResultPage(QWidget): + """Step 5: PII Verification results.""" + + def __init__(self, parent=None): + super().__init__(parent) + layout = QVBoxLayout(self) + layout.setContentsMargins(24, 24, 24, 24) + layout.setSpacing(16) + + # Banner + self.banner = QLabel() + self.banner.setAlignment(Qt.AlignmentFlag.AlignCenter) + self.banner.setStyleSheet("font-size: 16px; font-weight: bold; padding: 16px;") + layout.addWidget(self.banner) + + # Stats 
row + self.stats_layout = QHBoxLayout() + self.stats_layout.setSpacing(12) + layout.addLayout(self.stats_layout) + + # Findings table + self.table = QTableWidget() + self.table.setColumnCount(3) + self.table.setHorizontalHeaderLabels(["File", "Location", "Matched"]) + self.table.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.Stretch) + self.table.setEditTriggers(QTableWidget.EditTrigger.NoEditTriggers) + self.table.verticalHeader().setVisible(False) + self.table.hide() + layout.addWidget(self.table) + + # Status (loading) + self.status = QLabel("Scanning for PII...") + self.status.setAlignment(Qt.AlignmentFlag.AlignCenter) + self.status.setStyleSheet("color: #94a3b8; font-size: 14px; padding: 40px;") + layout.addWidget(self.status) + + layout.addStretch() + + def set_loading(self, loading: bool): + self.status.setVisible(loading) + self.banner.setVisible(not loading) + if loading: + self.status.setText("Scanning for PII...") + self.table.hide() + + def populate(self, data: dict): + self.set_loading(False) + + # Clear old stat cards + while self.stats_layout.count(): + item = self.stats_layout.takeAt(0) + if item.widget(): + item.widget().deleteLater() + + if data.get("skipped"): + self.banner.setText("SKIPPED -- No PII search strings configured") + self.banner.setStyleSheet( + "background: rgba(245, 158, 11, 0.1); border: 1px solid #f59e0b; " + "border-radius: 6px; padding: 16px; color: #f59e0b; " + "font-size: 16px; font-weight: bold;" + ) + elif data.get("passed"): + self.banner.setText("PASS -- No residual PII detected") + self.banner.setStyleSheet( + "background: rgba(34, 197, 94, 0.1); border: 1px solid #22c55e; " + "border-radius: 6px; padding: 16px; color: #22c55e; " + "font-size: 16px; font-weight: bold;" + ) + else: + count = len(data.get("findings", [])) + self.banner.setText(f"FAIL -- {count} PII finding(s)") + self.banner.setStyleSheet( + "background: rgba(239, 68, 68, 0.1); border: 1px solid #ef4444; " + "border-radius: 6px; padding: 
16px; color: #ef4444; " + "font-size: 16px; font-weight: bold;" + ) + + self.stats_layout.addWidget( + self._stat_card(str(data.get("files_scanned", 0)), "Files Scanned") + ) + findings = data.get("findings", []) + error_color = "#ef4444" if findings else "#6c63ff" + self.stats_layout.addWidget( + self._stat_card(str(len(findings)), "Findings", error_color) + ) + + if findings: + self.table.show() + self.table.setRowCount(len(findings)) + for i, f in enumerate(findings): + short_file = "/".join(Path(f["file"]).parts[-3:]) + self.table.setItem(i, 0, QTableWidgetItem(short_file)) + self.table.setItem(i, 1, QTableWidgetItem(f["location"])) + item = QTableWidgetItem(f["matched"]) + item.setForeground(Qt.GlobalColor.red) + self.table.setItem(i, 2, item) + else: + self.table.hide() + + def _stat_card(self, value: str, label: str, color: str = "#6c63ff") -> QFrame: + card = QFrame() + card.setStyleSheet( + "background-color: #1a1d27; border: 1px solid #2a3040; " + "border-radius: 6px; padding: 12px;" + ) + cl = QVBoxLayout(card) + cl.setContentsMargins(12, 12, 12, 12) + cl.setSpacing(4) + vl = QLabel(value) + vl.setStyleSheet( + f"font-size: 24px; font-weight: bold; color: {color}; " + f"font-family: 'Cascadia Code', monospace; background: transparent;" + ) + cl.addWidget(vl) + nl = QLabel(label) + nl.setStyleSheet( + "font-size: 11px; color: #94a3b8; text-transform: uppercase; " + "letter-spacing: 0.5px; background: transparent;" + ) + cl.addWidget(nl) + return card + + +class ReportPage(QWidget): + """Step 6: CBCT Shift Report display.""" + + def __init__(self, parent=None): + super().__init__(parent) + layout = QVBoxLayout(self) + layout.setContentsMargins(24, 24, 24, 24) + layout.setSpacing(16) + + self.status = QLabel("Generating report...") + self.status.setAlignment(Qt.AlignmentFlag.AlignCenter) + self.status.setStyleSheet("color: #94a3b8; font-size: 14px; padding: 40px;") + layout.addWidget(self.status) + + self.text_edit = QTextEdit() + 
self.text_edit.setReadOnly(True) + self.text_edit.setMinimumHeight(400) + self.text_edit.hide() + layout.addWidget(self.text_edit) + + layout.addStretch() + + def set_loading(self, loading: bool): + self.status.setVisible(loading) + self.text_edit.setVisible(not loading) + if loading: + self.status.setText("Generating report...") + + def set_report(self, markdown: str): + self.set_loading(False) + self.text_edit.setPlainText(markdown) + + def show_error(self, message: str): + self.set_loading(False) + self.text_edit.show() + self.text_edit.setPlainText(f"Error: {message}") + + +# --------------------------------------------------------------------------- +# Main window +# --------------------------------------------------------------------------- + +class LearnPipelineWindow(QMainWindow): + + STEP_NAMES = [ + "Configuration", + "Data Preview", + "Folder Sort", + "Anonymise", + "PII Verification", + "CBCT Shift Report", + ] + + def __init__(self): + super().__init__() + self.setWindowTitle("LEARN Pipeline") + self.setMinimumSize(1000, 700) + self.resize(1200, 820) + + self._config: dict = {} + self._mapper: Optional[LearnFolderMapper] = None + self._current_step = 0 + self._completed_steps: set[int] = set() + self._active_worker: Optional[QThread] = None + self._workers: list[QThread] = [] # prevent GC of running workers + + self._build_ui() + self._load_config() + + def _load_config(self) -> None: + """Load persisted config from JSON file into the config form.""" + try: + if CONFIG_FILE.is_file(): + data = json.loads(CONFIG_FILE.read_text(encoding="utf-8")) + self._config_page.set_config(data) + logger.info("Loaded config from %s", CONFIG_FILE) + except Exception: + logger.warning("Failed to load config from %s", CONFIG_FILE, exc_info=True) + + def _save_config(self, cfg: dict) -> None: + """Persist config dict to JSON file.""" + try: + CONFIG_FILE.write_text( + json.dumps(cfg, indent=2, default=str), encoding="utf-8", + ) + logger.info("Saved config to %s", 
CONFIG_FILE) + except Exception: + logger.warning("Failed to save config to %s", CONFIG_FILE, exc_info=True) + + def _build_ui(self): + central = QWidget() + self.setCentralWidget(central) + main_layout = QHBoxLayout(central) + main_layout.setContentsMargins(0, 0, 0, 0) + main_layout.setSpacing(0) + + # -- Sidebar -- + sidebar = QFrame() + sidebar.setFixedWidth(260) + sidebar.setProperty("class", "sidebar") + sidebar.setStyleSheet( + "QFrame { background-color: #11151c; border-right: 1px solid #2a3040; }" + ) + sidebar_layout = QVBoxLayout(sidebar) + sidebar_layout.setContentsMargins(0, 20, 0, 0) + sidebar_layout.setSpacing(0) + + title = QLabel("LEARN Pipeline") + title.setStyleSheet( + "color: #6c63ff; font-size: 14px; font-weight: bold; " + "letter-spacing: 1.5px; text-transform: uppercase; " + "padding: 0 20px 20px 20px; background: transparent;" + ) + sidebar_layout.addWidget(title) + + self._step_indicators: list[StepIndicator] = [] + for i, name in enumerate(self.STEP_NAMES): + si = StepIndicator(i + 1, name) + si.clicked.connect(self._on_step_clicked) + self._step_indicators.append(si) + sidebar_layout.addWidget(si) + + sidebar_layout.addStretch() + + version_label = QLabel("learn_upload v0.1.0") + version_label.setStyleSheet( + "color: #5c6578; font-size: 11px; padding: 16px 20px; " + "border-top: 1px solid #2a3040; background: transparent;" + ) + sidebar_layout.addWidget(version_label) + + main_layout.addWidget(sidebar) + + # -- Right panel -- + right_panel = QWidget() + right_layout = QVBoxLayout(right_panel) + right_layout.setContentsMargins(0, 0, 0, 0) + right_layout.setSpacing(0) + + # Header + header = QFrame() + header.setStyleSheet("background: #0f1117; border-bottom: 1px solid #2a3040;") + header.setFixedHeight(56) + header_layout = QHBoxLayout(header) + header_layout.setContentsMargins(24, 0, 24, 0) + self._header_title = QLabel("Configuration") + self._header_title.setStyleSheet( + "font-size: 18px; font-weight: bold; color: #e2e8f0; background: 
transparent;" + ) + header_layout.addWidget(self._header_title) + header_layout.addStretch() + right_layout.addWidget(header) + + # Stacked widget for pages + self._stack = QStackedWidget() + right_layout.addWidget(self._stack) + + # Pages + self._config_page = ConfigPage() + self._preview_page = PreviewPage() + self._anon_page = ProgressPage("Anonymise") + self._sort_page = ProgressPage("Folder Sort") + self._pii_page = PiiResultPage() + self._report_page = ReportPage() + + self._stack.addWidget(self._config_page) + self._stack.addWidget(self._preview_page) + self._stack.addWidget(self._sort_page) + self._stack.addWidget(self._anon_page) + self._stack.addWidget(self._pii_page) + self._stack.addWidget(self._report_page) + + # Bottom button bar + btn_bar = QFrame() + btn_bar.setStyleSheet("background: #0f1117; border-top: 1px solid #2a3040;") + btn_bar.setFixedHeight(60) + btn_layout = QHBoxLayout(btn_bar) + btn_layout.setContentsMargins(24, 0, 24, 0) + + self._btn_back = QPushButton("Back") + self._btn_back.clicked.connect(self._on_back) + btn_layout.addWidget(self._btn_back) + + btn_layout.addStretch() + + self._btn_continue = QPushButton("Continue") + self._btn_continue.setProperty("class", "primary") + self._btn_continue.clicked.connect(self._on_continue) + btn_layout.addWidget(self._btn_continue) + + right_layout.addWidget(btn_bar) + main_layout.addWidget(right_panel) + + # Initial state + self._go_to_step(0) + + # Attach log handler to the progress page terminals + self._log_handler_anon = QtLogHandler(self._anon_page.terminal) + self._log_handler_anon.setFormatter(logging.Formatter("%(asctime)s %(levelname)-7s %(name)s -- %(message)s")) + self._log_handler_sort = QtLogHandler(self._sort_page.terminal) + self._log_handler_sort.setFormatter(logging.Formatter("%(asctime)s %(levelname)-7s %(name)s -- %(message)s")) + + # -- Worker management -- + + def _start_worker(self, worker: QThread) -> None: + """Store worker reference to prevent GC, then start it.""" + 
self._active_worker = worker + self._workers.append(worker) + worker.finished.connect(lambda: self._cleanup_worker(worker)) + worker.start() + + def _cleanup_worker(self, worker: QThread) -> None: + """Remove finished worker from the keep-alive list.""" + try: + self._workers.remove(worker) + except ValueError: + pass + + # -- Navigation -- + + def _go_to_step(self, step: int): + self._current_step = step + self._stack.setCurrentIndex(step) + self._header_title.setText(self.STEP_NAMES[step]) + + for i, si in enumerate(self._step_indicators): + if i == step: + si.set_state("active") + elif i in self._completed_steps: + si.set_state("completed") + else: + si.set_state("future") + + # Update button labels and visibility + self._btn_back.setVisible(step > 0) + + # If this step is already completed, enable Continue to advance + already_done = step in self._completed_steps + + if step == 0: + self._btn_continue.setText("Continue to Preview") + self._btn_continue.setEnabled(True) + elif step == 1: + self._btn_continue.setText("Start Folder Sort") + self._btn_continue.setEnabled(already_done or True) + elif step == 2: + self._btn_continue.setText("Start Anonymisation") + self._btn_continue.setEnabled(already_done) + elif step == 3: + self._btn_continue.setText("Run PII Verification") + self._btn_continue.setEnabled(already_done) + elif step == 4: + self._btn_continue.setText("Generate CBCT Report") + self._btn_continue.setEnabled(already_done) + elif step == 5: + self._btn_continue.setText("New Patient") + self._btn_continue.setEnabled(already_done) + + def _on_step_clicked(self, step: int): + """Navigate to a completed step for review.""" + if step in self._completed_steps: + self._go_to_step(step) + + def _on_back(self): + if self._current_step > 0: + self._go_to_step(self._current_step - 1) + + def _on_continue(self): + step = self._current_step + # If both this step AND the next are already completed, just advance + # (user is re-visiting a past step). 
Otherwise fall through to + # actually run the next step's worker. + if step in self._completed_steps and (step + 1) in self._completed_steps and step < 5: + self._go_to_step(step + 1) + return + if step == 0: + self._submit_config() + elif step == 1: + self._start_folder_sort() + elif step == 2: + self._start_anonymise() + elif step == 3: + self._start_pii_check() + elif step == 4: + self._start_report() + elif step == 5: + self._reset_for_new_patient() + + # -- Step 1: Config -- + + def _submit_config(self): + error = self._config_page.validate() + if error: + QMessageBox.warning(self, "Validation Error", error) + return + + self._config = self._config_page.get_config() + self._save_config(self._config) + self._completed_steps.add(0) + self._go_to_step(1) + self._run_discovery() + + # -- Step 2: Discovery -- + + def _run_discovery(self): + self._preview_page.set_loading(True) + self._btn_continue.setEnabled(False) + + cfg = self._config + patient_dir = Path(cfg["source_path"]) + images_subdir = cfg.get("images_subdir", "IMAGES").strip() or "IMAGES" + + self._mapper = LearnFolderMapper( + patient_dir=patient_dir, + anon_id=cfg["anon_id"], + site_name=cfg["site_name"], + output_base=Path(cfg["output_path"]), + images_subdir=images_subdir, + ) + + worker = DiscoveryWorker(self._mapper) + worker.finished.connect(self._on_discovery_done) + worker.error.connect(self._on_discovery_error) + self._start_worker(worker) + + def _on_discovery_done(self, data: dict): + self._preview_page.populate(data) + self._btn_continue.setEnabled(True) + self._active_worker = None + + def _on_discovery_error(self, message: str): + self._preview_page.set_loading(False) + self._preview_page.status.setText(f"Discovery failed: {message}") + self._preview_page.status.setStyleSheet( + "color: #ef4444; font-size: 14px; padding: 40px;" + ) + self._active_worker = None + + # -- Step 3: Anonymise -- + + def _start_anonymise(self): + self._completed_steps.add(2) + self._go_to_step(3) + 
self._anon_page.reset() + self._anon_page.set_indeterminate("Scanning files...") + + # Attach log handler + logging.getLogger().addHandler(self._log_handler_anon) + + worker = AnonymiseWorker(self._config) + worker.progress.connect(self._on_anon_progress) + worker.finished.connect(self._on_anon_done) + worker.error.connect(self._on_anon_error) + self._start_worker(worker) + + def _on_anon_progress(self, current: int, total: int, filename: str): + self._anon_page.set_progress(current, total, filename) + + def _on_anon_done(self, data: dict): + logging.getLogger().removeHandler(self._log_handler_anon) + + self._anon_page.set_complete("Anonymisation complete") + + self._anon_page.add_stat(str(data.get("dcm", 0)), "DICOM") + self._anon_page.add_stat(str(data.get("xml", 0)), "XML") + self._anon_page.add_stat(str(data.get("ini", 0)), "INI") + self._anon_page.add_stat(str(data.get("tps_imported", 0)), "TPS Imported") + errors = data.get("errors", 0) + self._anon_page.add_stat( + str(errors), "Errors", "#ef4444" if errors > 0 else "#6c63ff" + ) + + self._completed_steps.add(3) + self._btn_continue.setEnabled(True) + self._active_worker = None + + def _on_anon_error(self, message: str): + logging.getLogger().removeHandler(self._log_handler_anon) + self._anon_page.show_error(message) + self._active_worker = None + + # -- Step 4: Folder Sort -- + + def _start_folder_sort(self): + self._completed_steps.add(1) + self._go_to_step(2) + self._sort_page.reset() + self._sort_page.set_indeterminate("Running folder sort...") + + logging.getLogger().addHandler(self._log_handler_sort) + + worker = FolderSortWorker(self._mapper, self._config) + worker.progress.connect( + lambda cur, tot, msg: self._sort_page.set_progress(cur, tot, msg) + ) + worker.finished.connect(self._on_sort_done) + worker.error.connect(self._on_sort_error) + self._start_worker(worker) + + def _on_sort_done(self, data: dict): + logging.getLogger().removeHandler(self._log_handler_sort) + + dry = data.get("dry_run", 
False) + self._sort_page.set_complete("Dry run complete" if dry else "Folder sort complete") + + self._sort_page.add_stat(str(data.get("sessions", 0)), "Sessions") + self._sort_page.add_stat(str(data.get("fractions", 0)), "Fractions") + fc = data.get("files_copied", {}) + self._sort_page.add_stat(str(fc.get("his", 0)), ".his Files") + self._sort_page.add_stat(str(fc.get("scan", 0)), "SCAN Files") + self._sort_page.add_stat(str(fc.get("rps", 0)), "RPS Files") + self._sort_page.add_stat(str(fc.get("ini", 0)), "INI Files") + + self._completed_steps.add(2) + self._btn_continue.setEnabled(True) + self._active_worker = None + + def _on_sort_error(self, message: str): + logging.getLogger().removeHandler(self._log_handler_sort) + self._sort_page.show_error(message) + self._active_worker = None + + # -- Step 5: PII Verification -- + + def _start_pii_check(self): + self._completed_steps.add(3) + self._go_to_step(4) + self._pii_page.set_loading(True) + self._btn_continue.setEnabled(False) + + worker = PiiCheckWorker(self._config) + worker.finished.connect(self._on_pii_done) + worker.error.connect(self._on_pii_error) + self._start_worker(worker) + + def _on_pii_done(self, data: dict): + self._pii_page.populate(data) + self._completed_steps.add(4) + self._btn_continue.setEnabled(True) + self._active_worker = None + + def _on_pii_error(self, message: str): + self._pii_page.set_loading(False) + self._pii_page.banner.setText(f"Error: {message}") + self._pii_page.banner.setStyleSheet( + "background: rgba(239, 68, 68, 0.1); border: 1px solid #ef4444; " + "border-radius: 6px; padding: 16px; color: #ef4444; " + "font-size: 16px; font-weight: bold;" + ) + self._pii_page.banner.show() + self._active_worker = None + + # -- Step 6: CBCT Shift Report -- + + def _start_report(self): + self._completed_steps.add(4) + self._go_to_step(5) + self._report_page.set_loading(True) + self._btn_continue.setEnabled(False) + + cfg = self._config + output_base = Path(cfg["output_path"]) + site_name = 
cfg["site_name"].strip() + anon_id = cfg["anon_id"].strip() + patient_images_path = str(output_base / site_name / "Patient Images" / anon_id) + + worker = ReportWorker(patient_images_path) + worker.finished.connect(self._on_report_done) + worker.error.connect(self._on_report_error) + self._start_worker(worker) + + def _on_report_done(self, markdown: str): + self._report_page.set_report(markdown) + + # Save report to Patient Files folder + cfg = self._config + output_base = Path(cfg["output_path"]) + site_name = cfg["site_name"].strip() + anon_id = cfg["anon_id"].strip() + report_dir = output_base / site_name / "Patient Files" / anon_id + report_dir.mkdir(parents=True, exist_ok=True) + report_path = report_dir / "cbct_shift_report.md" + try: + report_path.write_text(markdown, encoding="utf-8") + logger.info("Saved CBCT shift report to %s", report_path) + except Exception: + logger.warning("Failed to save shift report to %s", report_path, exc_info=True) + + self._completed_steps.add(5) + self._btn_continue.setText("New Patient") + self._btn_continue.setEnabled(True) + self._active_worker = None + + def _on_report_error(self, message: str): + self._report_page.show_error(message) + self._completed_steps.add(5) + self._btn_continue.setText("New Patient") + self._btn_continue.setEnabled(True) + self._active_worker = None + + # -- Reset -- + + def _reset_for_new_patient(self): + self._config = {} + self._mapper = None + self._completed_steps.clear() + self._go_to_step(0) + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + +def _enable_dark_title_bar(hwnd: int) -> None: + """Use DwmSetWindowAttribute to enable immersive dark mode title bar on Windows 11.""" + try: + DWMWA_USE_IMMERSIVE_DARK_MODE = 20 + value = ctypes.c_int(1) + ctypes.windll.dwmapi.DwmSetWindowAttribute( + hwnd, DWMWA_USE_IMMERSIVE_DARK_MODE, + ctypes.byref(value), 
ctypes.sizeof(value), + ) + except Exception: + pass # Non-Windows or unsupported version + + +def main() -> None: + if sys.stderr is not None: + faulthandler.enable() + setup_logging(logging.INFO) + + app = QApplication(sys.argv) + app.setStyleSheet(DARK_QSS) + app.setFont(QFont("Segoe UI", 10)) + + window = LearnPipelineWindow() + window.show() + + if sys.platform == "win32": + hwnd = int(window.winId()) + _enable_dark_title_bar(hwnd) + + sys.exit(app.exec()) + + +if __name__ == "__main__": + main() diff --git a/learn_upload/utils.py b/learn_upload/utils.py new file mode 100644 index 0000000..9cbff50 --- /dev/null +++ b/learn_upload/utils.py @@ -0,0 +1,276 @@ +""" +Shared parsing utilities for Elekta XVI data files. + +Functions here are generalised from patterns in the existing standalone scripts: +- scripts/elektafdt_crawler.py (XML parsing) +- scripts/extract_elekta_rps_matrices.py (ZIP-embedded INI parsing from RPS DICOM) + +They are designed to be reused across anonymise_dicom, folder_sort, +treatment_notes, and upload_workflow modules. +""" + +import io +import logging +import re +import xml.etree.ElementTree as ET +import zipfile +from datetime import datetime +from pathlib import Path +from typing import Optional + +from learn_upload.config import RPS_ZIP_TAG + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Plain INI parsing (Reconstruction/*.INI files) +# --------------------------------------------------------------------------- + +# Fields we extract from XVI plain INI files. The regex approach (not +# configparser) is inherited from scripts/extract_elekta_rps_matrices.py because XVI +# INI files use non-standard formatting that configparser chokes on. 
+_INI_FIELDS = [ + "PatientID", + "TreatmentID", + "TreatmentUID", + "ReferenceUID", + "FirstName", + "LastName", + "ScanUID", + "TubeKV", + "TubeMA", + "CollimatorName", +] + + +def parse_xvi_ini(ini_text: str) -> dict: + """Parse an Elekta XVI INI file and return extracted fields. + + Handles both ``[IDENTIFICATION]``-section fields from ``.INI`` files and + reconstruction parameters (TubeKV, TubeMA, ScanUID, CollimatorName) from + ``.INI.XVI`` files — the same regex works on either since the key=value + format is identical. + + Parameters + ---------- + ini_text : str + Raw text content of the INI file. + + Returns + ------- + dict + Mapping of field name -> string value for every field found. + Missing fields are omitted (not set to None). + """ + result = {} + for field in _INI_FIELDS: + match = re.search(rf"^{field}=(.+)$", ini_text, re.MULTILINE) + if match: + value = match.group(1).strip() + result[field] = value + return result + + +# --------------------------------------------------------------------------- +# ScanUID datetime parsing +# --------------------------------------------------------------------------- + +# ScanUID format example: +# 1.3.46.423632.33783920233217242713.224.2023-03-21165402768 +# The datetime is embedded at the end: YYYY-MM-DDHHMMSSmmm +_SCAN_DATETIME_PATTERN = re.compile( + r"(\d{4})-(\d{2})-(\d{2})(\d{2})(\d{2})(\d{2})(\d{3})$" +) + + +def parse_scan_datetime(scan_uid: str) -> Optional[datetime]: + """Extract the embedded datetime from an Elekta ScanUID string. + + Parameters + ---------- + scan_uid : str + Full ScanUID value, e.g. + ``"1.3.46.423632.33783920233217242713.224.2023-03-21165402768"`` + + Returns + ------- + datetime or None + Parsed datetime, or None if the pattern is not found. 
+ """ + match = _SCAN_DATETIME_PATTERN.search(scan_uid) + if not match: + logger.warning("Could not parse datetime from ScanUID: %s", scan_uid) + return None + + year, month, day, hour, minute, second, ms = (int(g) for g in match.groups()) + try: + return datetime(year, month, day, hour, minute, second, ms * 1000) + except ValueError as exc: + logger.warning("Invalid datetime values in ScanUID %s: %s", scan_uid, exc) + return None + + +# --------------------------------------------------------------------------- +# _Frames.xml parsing +# --------------------------------------------------------------------------- + +def parse_frames_xml(xml_path: Path) -> dict: + """Parse a ``_Frames.xml`` file and return treatment + acquisition metadata. + + Refactored from ``scripts/elektafdt_crawler.py:get_plan_name_from_xml()``. + + Parameters + ---------- + xml_path : Path + Path to the ``_Frames.xml`` file. + + Returns + ------- + dict + Keys: + - ``treatment_id`` (str or None) — ```` + - ``acquisition_preset`` (str or None) — ```` + - ``dicom_uid`` (str or None) — ```` + - ``kv`` (float or None) — ```` + - ``ma`` (float or None) — ```` + """ + result: dict = { + "treatment_id": None, + "acquisition_preset": None, + "dicom_uid": None, + "kv": None, + "ma": None, + } + try: + tree = ET.parse(xml_path) + root = tree.getroot() + + # Treatment ID + treatment_el = root.find("Treatment") + if treatment_el is not None: + id_el = treatment_el.find("ID") + if id_el is not None and id_el.text: + result["treatment_id"] = id_el.text.strip() + logger.info("Found treatment_id '%s' in %s", result["treatment_id"], xml_path) + else: + logger.warning("No Treatment/ID found in %s", xml_path) + + # Image acquisition metadata + image_el = root.find("Image") + if image_el is not None: + preset_el = image_el.find("AcquisitionPresetName") + if preset_el is not None and preset_el.text: + result["acquisition_preset"] = preset_el.text.strip() + + uid_el = image_el.find("DicomUID") + if uid_el is not 
None and uid_el.text: + result["dicom_uid"] = uid_el.text.strip() + + kv_el = image_el.find("kV") + if kv_el is not None and kv_el.text: + try: + result["kv"] = float(kv_el.text.strip()) + except ValueError: + logger.warning("Non-numeric kV value in %s: %s", xml_path, kv_el.text) + + ma_el = image_el.find("mA") + if ma_el is not None and ma_el.text: + try: + result["ma"] = float(ma_el.text.strip()) + except ValueError: + logger.warning("Non-numeric mA value in %s: %s", xml_path, ma_el.text) + + except ET.ParseError as exc: + logger.error("XML parse error in %s: %s", xml_path, exc) + except OSError as exc: + logger.error("Could not read %s: %s", xml_path, exc) + + return result + + +# --------------------------------------------------------------------------- +# Couch shift extraction from INI text +# --------------------------------------------------------------------------- + +def parse_couch_shifts(ini_text: str) -> Optional[dict]: + """Extract CouchShiftLat/Long/Height from XVI INI text. + + Parameters + ---------- + ini_text : str + Raw INI text content (from ``.INI.XVI`` or plain INI file). + + Returns + ------- + dict or None + ``{"lateral": float, "longitudinal": float, "vertical": float}`` + if all three shift keys are found, otherwise None. 
+ """ + couch_lat = re.search(r"CouchShiftLat=(.+)", ini_text) + couch_long = re.search(r"CouchShiftLong=(.+)", ini_text) + couch_height = re.search(r"CouchShiftHeight=(.+)", ini_text) + + if couch_lat and couch_long and couch_height: + try: + return { + "lateral": float(couch_lat.group(1).strip()), + "longitudinal": float(couch_long.group(1).strip()), + "vertical": float(couch_height.group(1).strip()), + } + except ValueError as exc: + logger.warning("Non-numeric couch shift value: %s", exc) + return None + + return None + + +# --------------------------------------------------------------------------- +# ZIP-embedded INI extraction from RPS DICOM +# --------------------------------------------------------------------------- + +def extract_ini_from_rps(dcm_path: Path) -> Optional[str]: + """Read an Elekta RPS DICOM file and return the embedded INI text. + + The RPS DICOM stores a ZIP archive in private tag ``(0021,103A)``. + Inside the ZIP is a ``.INI.XVI`` file with registration data. + + Refactored from ``scripts/extract_elekta_rps_matrices.py:extract_zip()``. + + Parameters + ---------- + dcm_path : Path + Path to the ``.RPS.dcm`` file. + + Returns + ------- + str or None + Raw INI text content, or None on failure. 
+ """ + try: + import pydicom + except ImportError: + logger.error("pydicom is required for RPS extraction but not installed") + return None + + try: + dcm = pydicom.dcmread(str(dcm_path)) + except Exception as exc: + logger.error("Failed to read DICOM %s: %s", dcm_path, exc) + return None + + if RPS_ZIP_TAG not in dcm: + logger.error("ZIP data tag %s not found in %s", RPS_ZIP_TAG, dcm_path) + return None + + zip_data = dcm[RPS_ZIP_TAG].value + try: + zip_buffer = io.BytesIO(zip_data) + with zipfile.ZipFile(zip_buffer, "r") as zf: + ini_files = [f for f in zf.namelist() if f.endswith(".INI.XVI")] + if not ini_files: + logger.error("No .INI.XVI file in ZIP from %s", dcm_path) + return None + return zf.read(ini_files[0]).decode("utf-8", errors="ignore") + except zipfile.BadZipFile: + logger.error("Invalid ZIP data in %s", dcm_path) + return None diff --git a/learn_upload/verify_pii.py b/learn_upload/verify_pii.py new file mode 100644 index 0000000..e69a26b --- /dev/null +++ b/learn_upload/verify_pii.py @@ -0,0 +1,151 @@ +"""Post-anonymisation PII verification for LEARN data transfer pipeline. + +Scans an output directory for residual patient-identifiable strings in DICOM +tags, XML text, plain-text files, and filenames. +""" + +import argparse +import logging +import sys +from pathlib import Path + +import pydicom + +logger = logging.getLogger(__name__) + +# pydicom VR types that contain human-readable strings worth checking. +_STRING_VRS = { + "LO", "SH", "PN", "LT", "ST", "UT", "DA", "DS", "IS", "CS", + "AE", "AS", "DT", "TM", "UC", "UI", +} + + +def verify_no_pii(directory: Path, pii_strings: list[str]) -> list[dict]: + """Scan *directory* for residual PII and return a list of findings. + + Parameters + ---------- + directory : Path + Root directory to scan recursively. + pii_strings : list[str] + Substrings to search for (case-insensitive). + + Returns + ------- + list[dict] + Each finding is ``{"file": Path, "location": str, "matched": str}``. 
+ """ + directory = Path(directory) + if not directory.is_dir(): + logger.error("Directory does not exist: %s", directory) + return [] + + pii_lower = [s.lower() for s in pii_strings] + findings: list[dict] = [] + files_scanned = 0 + + for path in sorted(directory.rglob("*")): + if not path.is_file(): + continue + files_scanned += 1 + + # --- Check filename --- + name_lower = path.name.lower() + for pii, original in zip(pii_lower, pii_strings): + if pii in name_lower: + findings.append({ + "file": path, + "location": "filename", + "matched": original, + }) + + suffix = path.suffix.lower() + + # --- DICOM files --- + if suffix in (".dcm",): + findings.extend(_check_dicom(path, pii_lower, pii_strings)) + + # --- XML files --- + elif suffix in (".xml",): + findings.extend(_check_text_file(path, pii_lower, pii_strings, "xml text")) + + # --- Plain text files --- + elif suffix in (".txt",): + findings.extend(_check_text_file(path, pii_lower, pii_strings, "text content")) + + # --- Human-readable summary --- + print(f"\nPII Verification: scanned {files_scanned} files in {directory}") + if findings: + print(f"FAIL — {len(findings)} PII finding(s):") + for f in findings: + print(f" {f['file']} [{f['location']}] matched '{f['matched']}'") + else: + print("PASS — no residual PII detected") + + return findings + + +def _check_dicom( + path: Path, pii_lower: list[str], pii_originals: list[str], +) -> list[dict]: + """Check all string-valued DICOM data elements for PII substrings.""" + findings: list[dict] = [] + try: + ds = pydicom.dcmread(path, force=True) + except Exception: + logger.warning("Could not read DICOM file: %s", path) + return findings + + for elem in ds.iterall(): + if elem.VR not in _STRING_VRS: + continue + value_str = str(elem.value).lower() + for pii, original in zip(pii_lower, pii_originals): + if pii in value_str: + tag_name = elem.keyword or str(elem.tag) + findings.append({ + "file": path, + "location": f"tag {tag_name} {elem.tag}", + "matched": 
original, + }) + return findings + + +def _check_text_file( + path: Path, + pii_lower: list[str], + pii_originals: list[str], + location_label: str, +) -> list[dict]: + """Read a text file and check for PII substrings.""" + findings: list[dict] = [] + try: + text = path.read_text(encoding="utf-8", errors="replace").lower() + except Exception: + logger.warning("Could not read file: %s", path) + return findings + + for pii, original in zip(pii_lower, pii_originals): + if pii in text: + findings.append({ + "file": path, + "location": location_label, + "matched": original, + }) + return findings + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Verify an anonymised directory contains no residual PII.", + ) + parser.add_argument("directory", type=Path, help="Directory to scan") + parser.add_argument("pii_strings", nargs="+", help="PII substrings to search for") + args = parser.parse_args() + + findings = verify_no_pii(args.directory, args.pii_strings) + sys.exit(1 if findings else 0) + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..8f53d7f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,44 @@ +[build-system] +requires = ["setuptools>=64"] +build-backend = "setuptools.backends._legacy:_Backend" + +[project] +name = "learn-upload" +version = "0.1.0" +description = "Automation tools for the LEARN data transfer pipeline" +requires-python = ">=3.9" +dependencies = [ + "pydicom>=3.0.1", +] + +[project.scripts] +learn-upload = "learn_upload.__main__:main" + +[project.optional-dependencies] +gui = ["PyQt6", "numpy"] +dev = [ + "pytest>=7.0", + "ruff>=0.4", +] + +[tool.pytest.ini_options] +testpaths = ["tests"] + +[tool.ruff] +target-version = "py39" +line-length = 100 + +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # pyflakes + "I", # isort + "UP", # pyupgrade +] +ignore = [ + "E501", # line too long +] + 
+[tool.ruff.lint.per-file-ignores] +"tests/*" = ["E501"] diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..984cf01 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,3 @@ +pydicom>=3.0.1 +pytest>=7.0 +ruff>=0.4 diff --git a/scripts/dir_tree.py b/scripts/dir_tree.py new file mode 100644 index 0000000..c69f1cf --- /dev/null +++ b/scripts/dir_tree.py @@ -0,0 +1,143 @@ +"""Visualise a directory tree for auditing folder transfers. + +Recursively walks a directory and prints a visual tree using box-drawing +characters. File listings are truncated to 5 per folder (with total count) +to keep output manageable on large directories. + +Usage: + # Print tree to default output file (dir_tree_output.md beside target) + python scripts/dir_tree.py path/to/directory + + # Specify output file + python scripts/dir_tree.py path/to/directory -o audit_report.md + + # Change max files shown per folder (default: 5) + python scripts/dir_tree.py path/to/directory --max-files 10 +""" + +import argparse +import sys +from pathlib import Path + +MAX_FILES_DEFAULT = 5 + + +def build_tree( + directory: Path, prefix: str, max_files: int, progress: bool = False, +) -> tuple[list[str], int, int]: + """Recursively build tree lines for *directory*. + + Returns (lines, total_folders, total_files). 
+ """ + if progress: + print(f"Scanning: {directory}", file=sys.stderr, flush=True) + + lines: list[str] = [] + total_folders = 0 + total_files = 0 + + try: + entries = sorted(directory.iterdir(), key=lambda e: (e.is_file(), e.name.lower())) + except PermissionError: + lines.append(f"{prefix}[Permission Denied]") + return lines, 0, 0 + + dirs = [e for e in entries if e.is_dir()] + files = [e for e in entries if e.is_file()] + + total_folders += len(dirs) + total_files += len(files) + + # Combine items for connector logic: dirs first, then (possibly truncated) files + items: list[str] = [] + subtrees: dict[int, tuple[list[str], int, int]] = {} + + for i, d in enumerate(dirs): + items.append(d.name + "/") + subtrees[len(items) - 1] = build_tree(d, "", max_files, progress=progress) + + if len(files) <= max_files: + for f in files: + items.append(f.name) + else: + for f in files[:max_files]: + items.append(f.name) + remaining = len(files) - max_files + items.append(f"... and {remaining} more file{'s' if remaining != 1 else ''}") + + for idx, item in enumerate(items): + is_last = idx == len(items) - 1 + connector = "└── " if is_last else "├── " + lines.append(f"{prefix}{connector}{item}") + + if idx in subtrees: + sub_lines, sub_folders, sub_files = subtrees[idx] + extension = " " if is_last else "│ " + for sl in sub_lines: + lines.append(f"{prefix}{extension}{sl}") + total_folders += sub_folders + total_files += sub_files + + return lines, total_folders, total_files + + +def generate_tree( + directory: Path, max_files: int = MAX_FILES_DEFAULT, progress: bool = False, +) -> str: + """Return the full tree string for *directory*.""" + root = directory.resolve() + if not root.is_dir(): + return f"Error: '{root}' is not a directory." 
+ + tree_lines, total_folders, total_files = build_tree(root, "", max_files, progress=progress) + + output_lines = [ + f"# Directory Tree: {root.name}", + "", + "```", + f"{root.name}/", + ] + output_lines.extend(tree_lines) + output_lines.append("```") + output_lines.append("") + output_lines.append(f"**Summary:** {total_folders} folders, {total_files} files") + output_lines.append("") + + return "\n".join(output_lines) + + +def main(argv: list[str] | None = None) -> None: + parser = argparse.ArgumentParser( + description="Generate a directory tree audit report (.md)." + ) + parser.add_argument("directory", type=Path, help="Root directory to scan.") + parser.add_argument( + "-o", + "--output", + type=Path, + default=None, + help="Output .md file (default: dir_tree_output.md beside target directory).", + ) + parser.add_argument( + "--max-files", + type=int, + default=MAX_FILES_DEFAULT, + help=f"Max files shown per folder before truncation (default: {MAX_FILES_DEFAULT}).", + ) + + args = parser.parse_args(argv) + + if not args.directory.is_dir(): + print(f"Error: '{args.directory}' is not a directory.", file=sys.stderr) + sys.exit(1) + + output_path = args.output or (args.directory.resolve().parent / "dir_tree_output.md") + + result = generate_tree(args.directory, max_files=args.max_files, progress=True) + + output_path.write_text(result, encoding="utf-8") + print(f"Tree written to {output_path}") + + +if __name__ == "__main__": + main() diff --git a/scripts/elektafdt_crawler.py b/scripts/elektafdt_crawler.py new file mode 100644 index 0000000..5c84458 --- /dev/null +++ b/scripts/elektafdt_crawler.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python3 +""" +Script to extract treatment plan names from patient directories and create a CSV file. +Scans all patient_* directories and reads _Frames.xml files to extract actual plan names. 
+""" + +import csv +import logging +import xml.etree.ElementTree as ET +from pathlib import Path + +# Set up logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +def get_plan_name_from_xml(xml_file_path): + """ + Extract the plan name from a _Frames.xml file. + + Args: + xml_file_path (str): Path to the _Frames.xml file + + Returns: + str: Plan name or error message if not found + """ + try: + # Parse the XML file + tree = ET.parse(xml_file_path) + root = tree.getroot() + + # Look for Treatment/ID element + treatment_element = root.find('Treatment') + if treatment_element is not None: + id_element = treatment_element.find('ID') + if id_element is not None and id_element.text: + plan_name = id_element.text.strip() + logger.info(f"Found plan name '{plan_name}' in {xml_file_path}") + return plan_name + + logger.warning(f"No Treatment/ID found in XML file: {xml_file_path}") + return "No Treatment ID found" + + except ET.ParseError as e: + logger.error(f"Error parsing XML file {xml_file_path}: {str(e)}") + return "XML Parse Error" + except Exception as e: + logger.error(f"Error reading XML file {xml_file_path}: {str(e)}") + return "Error Reading XML" + +def scan_patient_directories(base_path): + """ + Scan the base directory for patient directories and extract plan names from XML files. 
+ + Args: + base_path (str): Base directory path containing patient directories + + Returns: + list: List of tuples (patient_dir, plan_name) + """ + base_path = Path(base_path) + results = [] + + if not base_path.exists(): + logger.error(f"Base directory does not exist: {base_path}") + return results + + # Find all patient directories + patient_dirs = [] + for item in base_path.iterdir(): + if item.is_dir() and item.name.startswith('patient_'): + patient_dirs.append(item.name) + + # Sort alphanumerically + patient_dirs.sort() + logger.info(f"Found {len(patient_dirs)} patient directories") + + for patient_dir in patient_dirs: + patient_path = base_path / patient_dir + images_path = patient_path / "IMAGES" + + logger.info(f"Processing {patient_dir}") + + if not images_path.exists(): + logger.warning(f"IMAGES directory not found for {patient_dir}") + results.append((patient_dir, "No IMAGES directory")) + continue + + # Find img_* directories in the IMAGES directory + img_dirs = [item for item in images_path.iterdir() if item.is_dir() and item.name.startswith('img_')] + + if not img_dirs: + logger.warning(f"No img_* directories found in {images_path}") + results.append((patient_dir, "No img directories found")) + continue + + # Process the first img directory found (assuming one plan per patient) + img_dir = img_dirs[0] + frames_xml_path = img_dir / "_Frames.xml" + + if not frames_xml_path.exists(): + logger.warning(f"_Frames.xml not found in {img_dir}") + results.append((patient_dir, "No _Frames.xml found")) + continue + + plan_name = get_plan_name_from_xml(str(frames_xml_path)) + results.append((patient_dir, plan_name)) + + if len(img_dirs) > 1: + logger.info(f"Multiple img directories found for {patient_dir}, using first one: {img_dir.name}") + + return results + +def create_csv_file(results, output_file): + """ + Create a CSV file with the results. 
+ + Args: + results (list): List of tuples (patient_dir, plan_name) + output_file (str): Output CSV file path + """ + try: + with open(output_file, 'w', newline='', encoding='utf-8') as csvfile: + writer = csv.writer(csvfile) + writer.writerow(['Patient_Directory', 'Plan_Name']) + writer.writerows(results) + + logger.info(f"CSV file created successfully: {output_file}") + print(f"CSV file created: {output_file}") + print(f"Total records: {len(results)}") + + except Exception as e: + logger.error(f"Error creating CSV file: {str(e)}") + print(f"Error creating CSV file: {str(e)}") + +def main(): + # Configuration + base_directory = r"E:\XVI_COLLECTION\processed\20230403_Flinders" + output_csv = "patient_dicom_plans.csv" + + print("Treatment Plan Extractor") + print("=======================") + print(f"Scanning directory: {base_directory}") + print(f"Output file: {output_csv}") + print() + + # Scan directories and extract plan names + results = scan_patient_directories(base_directory) + + if not results: + print("No patient directories or treatment plans found.") + return + + # Create CSV file + create_csv_file(results, output_csv) + + # Display summary + print("\nSummary:") + print(f"Processed {len(results)} patient directories") + + # Show first few results as preview + print("\nFirst 5 results:") + for i, (patient_dir, plan_name) in enumerate(results[:5]): + print(f" {patient_dir}: {plan_name}") + + if len(results) > 5: + print(f" ... and {len(results) - 5} more") + +if __name__ == "__main__": + main() diff --git a/scripts/extract_elekta_rps_matrices.py b/scripts/extract_elekta_rps_matrices.py new file mode 100644 index 0000000..3c43a3a --- /dev/null +++ b/scripts/extract_elekta_rps_matrices.py @@ -0,0 +1,288 @@ +#!/usr/bin/env python3 +""" +Elekta XVI RPS DICOM Matrix Extractor + +Extracts registration/transformation matrices from Elekta XVI RPS DICOM exports. 
+These files use proprietary private tags with embedded ZIP files containing INI files +with the actual registration data. + +Author: Medical Physics Utility +Usage: python extract_elekta_rps_matrices.py <dicom_file> +""" + +import io +import re +import sys +import zipfile +from pathlib import Path + +import numpy as np +import pydicom + + +class ElektaRPSExtractor: + """Extract registration matrices from Elekta XVI RPS DICOM files""" + + def __init__(self, dicom_path): + self.dicom_path = Path(dicom_path) + self.dcm = None + self.ini_content = None + self.matrices = {} + self.alignment_info = {} + + def read_dicom(self): + """Read the DICOM file""" + print(f"Reading DICOM file: {self.dicom_path}") + self.dcm = pydicom.dcmread(str(self.dicom_path)) + + # Verify it's an Elekta REG modality + if self.dcm.Modality != 'REG': + print(f"Warning: Modality is {self.dcm.Modality}, expected REG") + + if hasattr(self.dcm, 'Manufacturer'): + print(f"Manufacturer: {self.dcm.Manufacturer}") + + return True + + def extract_zip(self): + """Extract the embedded ZIP file from private DICOM tag""" + # Elekta stores ZIP data in private tag (0021,103A) + if (0x0021, 0x103A) not in self.dcm: + raise ValueError("ZIP data not found in expected tag (0021,103A)") + + zip_data = self.dcm[0x0021, 0x103A].value + + try: + zip_buffer = io.BytesIO(zip_data) + with zipfile.ZipFile(zip_buffer, 'r') as zf: + print(f"\nFound ZIP archive with {len(zf.namelist())} files:") + for filename in zf.namelist(): + print(f" - {filename}") + + # Find and read the .INI.XVI file (contains registration data) + ini_files = [f for f in zf.namelist() if f.endswith('.INI.XVI')] + if not ini_files: + raise ValueError("No .INI.XVI file found in ZIP archive") + + print(f"\nReading registration data from: {ini_files[0]}") + self.ini_content = zf.read(ini_files[0]).decode('utf-8', errors='ignore') + + except zipfile.BadZipFile: + raise ValueError("Invalid ZIP data in DICOM file") + + return True + + def parse_matrix(self, 
matrix_string): + """Parse a 16-element transformation matrix string into 4x4 numpy array""" + values = [float(x) for x in matrix_string.split()] + if len(values) != 16: + return None + + # Reshape into 4x4 matrix (row-major order) + matrix = np.array(values).reshape(4, 4) + return matrix + + def extract_matrices(self): + """Extract all transformation matrices from INI content""" + if not self.ini_content: + raise ValueError("No INI content loaded") + + # Extract unmatched transformation matrices + unmatched_pattern = r'OnlineToRefTransformUnMatched=(.+?)(?:\n|$)' + unmatched_matches = re.findall(unmatched_pattern, self.ini_content) + + # Extract correction transformation matrices + correction_pattern = r'OnlineToRefTransformCorrection=(.+?)(?:\n|$)' + correction_matches = re.findall(correction_pattern, self.ini_content) + + self.matrices['unmatched'] = [] + for matrix_str in unmatched_matches: + matrix = self.parse_matrix(matrix_str) + if matrix is not None: + self.matrices['unmatched'].append(matrix) + + self.matrices['correction'] = [] + for matrix_str in correction_matches: + matrix = self.parse_matrix(matrix_str) + if matrix is not None: + self.matrices['correction'].append(matrix) + + return len(self.matrices['unmatched']) + len(self.matrices['correction']) > 0 + + def extract_alignment_info(self): + """Extract alignment parameters and couch shifts""" + if not self.ini_content: + raise ValueError("No INI content loaded") + + # Alignment date/time + align_info = re.search(r'\[ALIGNMENT\.(\d+); ([\d:]+)\]', self.ini_content) + if align_info: + self.alignment_info['date'] = align_info.group(1) + self.alignment_info['time'] = align_info.group(2) + + # Clipbox alignment + clip_match = re.search(r'Align\.clip1=(.+)', self.ini_content) + if clip_match: + values = [float(x.strip()) for x in clip_match.group(1).split(',')] + self.alignment_info['clipbox'] = { + 'lateral': values[0], + 'longitudinal': values[1], + 'vertical': values[2], + 'rotation': values[3], + 
'pitch': values[4], + 'roll': values[5] + } + + # Mask alignment + mask_match = re.search(r'Align\.mask1=(.+)', self.ini_content) + if mask_match: + values = [float(x.strip()) for x in mask_match.group(1).split(',')] + self.alignment_info['mask'] = { + 'lateral': values[0], + 'longitudinal': values[1], + 'vertical': values[2], + 'rotation': values[3], + 'pitch': values[4], + 'roll': values[5] + } + + # Couch shifts + couch_lat = re.search(r'CouchShiftLat=(.+)', self.ini_content) + couch_long = re.search(r'CouchShiftLong=(.+)', self.ini_content) + couch_height = re.search(r'CouchShiftHeight=(.+)', self.ini_content) + + if couch_lat and couch_long and couch_height: + self.alignment_info['couch_shifts'] = { + 'lateral': float(couch_lat.group(1).strip()), + 'longitudinal': float(couch_long.group(1).strip()), + 'vertical': float(couch_height.group(1).strip()) + } + + # Isocenter + isoc_match = re.search(r'IsocX=(.+?)\nIsocY=(.+?)\nIsocZ=(.+?)\n', self.ini_content) + if isoc_match: + self.alignment_info['isocenter'] = { + 'x': float(isoc_match.group(1).strip()), + 'y': float(isoc_match.group(2).strip()), + 'z': float(isoc_match.group(3).strip()) + } + + # Registration protocol + reg_protocol = re.search(r'RegistrationProtocol=(.+)', self.ini_content) + if reg_protocol: + self.alignment_info['registration_protocol'] = reg_protocol.group(1) + + return True + + def print_results(self): + """Print extracted matrices and alignment information""" + print("\n" + "="*70) + print("ELEKTA XVI REGISTRATION DATA") + print("="*70) + + if 'date' in self.alignment_info: + print(f"\nAlignment Date: {self.alignment_info['date']}") + print(f"Alignment Time: {self.alignment_info['time']}") + + if 'registration_protocol' in self.alignment_info: + print(f"Registration Protocol: {self.alignment_info['registration_protocol']}") + + # Print alignment parameters + if 'clipbox' in self.alignment_info: + cb = self.alignment_info['clipbox'] + print("\nClipbox Alignment:") + print(f" Translation 
(L/L/V): {cb['lateral']:.2f}, {cb['longitudinal']:.2f}, {cb['vertical']:.2f} cm") + print(f" Rotation (R/P/R): {cb['rotation']:.1f}°, {cb['pitch']:.1f}°, {cb['roll']:.1f}°") + + if 'mask' in self.alignment_info: + m = self.alignment_info['mask'] + print("\nMask Alignment:") + print(f" Translation (L/L/V): {m['lateral']:.2f}, {m['longitudinal']:.2f}, {m['vertical']:.2f} cm") + print(f" Rotation (R/P/R): {m['rotation']:.1f}°, {m['pitch']:.1f}°, {m['roll']:.1f}°") + + if 'couch_shifts' in self.alignment_info: + cs = self.alignment_info['couch_shifts'] + print("\nCouch Shifts (applied):") + print(f" Lateral: {cs['lateral']:.2f} cm") + print(f" Longitudinal: {cs['longitudinal']:.2f} cm") + print(f" Vertical: {cs['vertical']:.2f} cm") + + if 'isocenter' in self.alignment_info: + iso = self.alignment_info['isocenter'] + print("\nReference Isocenter:") + print(f" X: {iso['x']:.3f} cm") + print(f" Y: {iso['y']:.3f} cm") + print(f" Z: {iso['z']:.3f} cm") + + # Print transformation matrices + print("\n" + "="*70) + print("4x4 TRANSFORMATION MATRICES") + print("="*70) + + for i, matrix in enumerate(self.matrices.get('unmatched', []), 1): + print(f"\nOnlineToRefTransform_Unmatched #{i}:") + print(matrix) + + for i, matrix in enumerate(self.matrices.get('correction', []), 1): + print(f"\nOnlineToRefTransform_Correction #{i}:") + print(matrix) + + print("\n" + "="*70) + + def get_correction_matrix(self, index=0): + """ + Get the correction transformation matrix + + Parameters: + ----------- + index : int + Index of correction matrix (default 0 for first/only matrix) + + Returns: + -------- + numpy.ndarray : 4x4 transformation matrix + """ + if 'correction' not in self.matrices or len(self.matrices['correction']) == 0: + raise ValueError("No correction matrices found") + + if index >= len(self.matrices['correction']): + raise IndexError(f"Correction matrix index {index} out of range") + + return self.matrices['correction'][index] + + def extract_all(self): + """Convenience method 
to extract everything""" + self.read_dicom() + self.extract_zip() + self.extract_matrices() + self.extract_alignment_info() + return True + + +def main(): + if len(sys.argv) != 2: + print("Usage: python extract_elekta_rps_matrices.py <dicom_file>") + sys.exit(1) + + dicom_file = sys.argv[1] + + try: + extractor = ElektaRPSExtractor(dicom_file) + extractor.extract_all() + extractor.print_results() + + # Example: Access specific matrix + print("\n" + "="*70) + print("EXAMPLE: Accessing correction matrix programmatically") + print("="*70) + correction_matrix = extractor.get_correction_matrix(0) + print("Correction matrix (4x4):") + print(correction_matrix) + + except Exception as e: + print(f"\nError: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/read_dicom_tags.py b/scripts/read_dicom_tags.py new file mode 100644 index 0000000..3a0093c --- /dev/null +++ b/scripts/read_dicom_tags.py @@ -0,0 +1,93 @@ +"""Read and display all DICOM tags from a given DICOM file. 
+ +Usage (CLI): + # Print tags to console + python read_dicom_tags.py path/to/file.dcm + + # Write tags to a text file + python read_dicom_tags.py path/to/file.dcm -o tags.txt + + # Include private tags + python read_dicom_tags.py path/to/file.dcm --private + + # Combine options + python read_dicom_tags.py path/to/file.dcm -o tags.txt --private + +Usage (Python): + from read_dicom_tags import read_dicom_tags + + # Print to console + read_dicom_tags("path/to/file.dcm") + + # Write to file with private tags + read_dicom_tags("path/to/file.dcm", output="tags.txt", show_private=True) +""" + +import argparse +import sys + +from pydicom import dcmread +from pydicom.errors import InvalidDicomError + + +def read_dicom_tags(filepath: str, output: str = None, show_private: bool = False) -> None: + """Print every DICOM tag in the file, optionally writing to an output file.""" + try: + ds = dcmread(filepath) + except (InvalidDicomError, FileNotFoundError, PermissionError) as e: + print(f"Error reading {filepath}: {e}", file=sys.stderr) + sys.exit(1) + + lines = [] + lines.append(f"File: {filepath}") + lines.append(f"SOP Class: {ds.SOPClassUID.name if 'SOPClassUID' in ds else 'N/A'}") + lines.append("-" * 80) + + for elem in ds.iterall(): + if not show_private and elem.tag.is_private: + continue + tag = f"({elem.tag.group:04X},{elem.tag.element:04X})" + vr = elem.VR + name = elem.keyword or elem.name + value = _format_value(elem) + lines.append(f"{tag} {vr:4s} {name:40s} {value}") + + text = "\n".join(lines) + "\n" + + if output: + with open(output, "w", encoding="utf-8") as f: + f.write(text) + print(f"Written to {output}") + else: + print(text, end="") + + +def _format_value(elem) -> str: + """Format a DICOM element value for display.""" + if elem.VR == "SQ": + return f"<Sequence of {len(elem.value)} item(s)>" + if elem.VR in ("OB", "OW", "OF", "OD", "UN"): + length = len(elem.value) if elem.value else 0 + return f"<{length} bytes>" + value = str(elem.value) + if len(value) > 120: + return value[:120] + "..." 
+ return value + + +def main(): + parser = argparse.ArgumentParser(description="Read all DICOM tags from a file.") + parser.add_argument("file", help="Path to the DICOM file") + parser.add_argument( + "-o", "--output", help="Path to output text file (prints to console if omitted)" + ) + parser.add_argument( + "--private", action="store_true", help="Include private tags in output" + ) + args = parser.parse_args() + + read_dicom_tags(args.file, output=args.output, show_private=args.private) + + +if __name__ == "__main__": + main() diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_anonymise_dicom.py b/tests/test_anonymise_dicom.py new file mode 100644 index 0000000..71c26fd --- /dev/null +++ b/tests/test_anonymise_dicom.py @@ -0,0 +1,717 @@ +"""Tests for learn_upload.anonymise_dicom — DicomAnonymiser.""" + +import xml.etree.ElementTree as ET +from pathlib import Path + +import pydicom +import pytest +from pydicom.dataset import FileDataset +from pydicom.uid import ExplicitVRLittleEndian, generate_uid + +from learn_upload.anonymise_dicom import ( + DicomAnonymiser, + anonymise_centroid_file, + anonymise_ini_file, + anonymise_output_folder, + anonymise_trajectory_log, +) + +# --------------------------------------------------------------------------- +# Helper: create a minimal synthetic DICOM file +# --------------------------------------------------------------------------- + +def _make_test_dicom( + directory: Path, + filename: str = "test.DCM", + patient_name: str = "Doe^John", + patient_id: str = "12345678", + patient_birth_date: str = "19800101", + institution_name: str = "Test Hospital", + study_id: str = "STUDY1", + patient_sex: str = "M", + patient_age: str = "044Y", + study_description: str = "CT Head", +) -> Path: + """Create a minimal valid DICOM file for testing.""" + directory.mkdir(parents=True, exist_ok=True) + filepath = directory / filename + + file_meta = pydicom.Dataset() + 
file_meta.MediaStorageSOPClassUID = "1.2.840.10008.5.1.4.1.1.2" # CT + file_meta.MediaStorageSOPInstanceUID = generate_uid() + file_meta.TransferSyntaxUID = ExplicitVRLittleEndian + + ds = FileDataset(str(filepath), {}, file_meta=file_meta, preamble=b"\x00" * 128) + ds.is_little_endian = True + ds.is_implicit_VR = False + + # Tags that should be REPLACED with anon_id + ds.PatientName = patient_name + ds.PatientID = patient_id + ds.StudyID = study_id + + # Tags that should be CLEARED + ds.PatientBirthDate = patient_birth_date + ds.AccessionNumber = "ACC001" + ds.InstitutionName = institution_name + ds.InstitutionAddress = "123 Test St" + ds.ReferringPhysicianName = "Smith^Alice" + ds.PhysiciansOfRecord = "Jones^Bob" + ds.OperatorsName = "Operator^One" + + # Tags that should be PRESERVED + ds.PatientSex = patient_sex + ds.PatientAge = patient_age + ds.StudyDescription = study_description + + # UIDs that must be preserved + ds.StudyInstanceUID = generate_uid() + ds.SeriesInstanceUID = generate_uid() + ds.SOPInstanceUID = file_meta.MediaStorageSOPInstanceUID + ds.SOPClassUID = file_meta.MediaStorageSOPClassUID + + ds.save_as(filepath) + return filepath + + +# --------------------------------------------------------------------------- +# Tests: anonymise_file +# --------------------------------------------------------------------------- + +class TestAnonymiseFile: + def test_replaces_tags(self, tmp_path): + patient_dir = tmp_path / "patient_00000001" + dcm_path = _make_test_dicom(patient_dir / "CT_SET", "slice.DCM") + output_dir = tmp_path / "output" + + anon = DicomAnonymiser(patient_dir, "PAT01", output_dir) + out = anon.anonymise_file(dcm_path) + + ds = pydicom.dcmread(out) + assert str(ds.PatientName) == "PAT01^" + assert ds.PatientID == "PAT01" + assert ds.StudyID == "PAT01" + + def test_clears_tags(self, tmp_path): + patient_dir = tmp_path / "patient_00000001" + dcm_path = _make_test_dicom(patient_dir / "CT_SET", "slice.DCM") + output_dir = tmp_path / "output" + + 
anon = DicomAnonymiser(patient_dir, "PAT01", output_dir) + out = anon.anonymise_file(dcm_path) + + ds = pydicom.dcmread(out) + assert ds.PatientBirthDate == "" + assert ds.AccessionNumber == "" + assert ds.InstitutionName == "" + assert ds.InstitutionAddress == "" + assert str(ds.ReferringPhysicianName) == "" + assert str(ds.PhysiciansOfRecord) == "" + assert str(ds.OperatorsName) == "" + + def test_preserves_uids(self, tmp_path): + patient_dir = tmp_path / "patient_00000001" + dcm_path = _make_test_dicom(patient_dir / "CT_SET", "slice.DCM") + output_dir = tmp_path / "output" + + original = pydicom.dcmread(dcm_path) + orig_study_uid = original.StudyInstanceUID + orig_series_uid = original.SeriesInstanceUID + orig_sop_uid = original.SOPInstanceUID + + anon = DicomAnonymiser(patient_dir, "PAT01", output_dir) + out = anon.anonymise_file(dcm_path) + + ds = pydicom.dcmread(out) + assert ds.StudyInstanceUID == orig_study_uid + assert ds.SeriesInstanceUID == orig_series_uid + assert ds.SOPInstanceUID == orig_sop_uid + + def test_preserves_research_tags(self, tmp_path): + patient_dir = tmp_path / "patient_00000001" + dcm_path = _make_test_dicom( + patient_dir / "CT_SET", "slice.DCM", + patient_sex="F", patient_age="055Y", study_description="Pelvis RT", + ) + output_dir = tmp_path / "output" + + anon = DicomAnonymiser(patient_dir, "PAT02", output_dir) + out = anon.anonymise_file(dcm_path) + + ds = pydicom.dcmread(out) + assert ds.PatientSex == "F" + assert ds.PatientAge == "055Y" + assert ds.StudyDescription == "Pelvis RT" + + def test_missing_optional_tag(self, tmp_path): + """File without InstitutionName in DICOM_TAGS_CLEAR doesn't crash.""" + patient_dir = tmp_path / "patient_00000001" + dcm_path = _make_test_dicom(patient_dir / "CT_SET", "slice.DCM") + + # Remove InstitutionName from the file before anonymising + ds = pydicom.dcmread(dcm_path) + del ds.InstitutionName + ds.save_as(dcm_path) + + output_dir = tmp_path / "output" + anon = DicomAnonymiser(patient_dir, 
"PAT01", output_dir) + out = anon.anonymise_file(dcm_path) + + result = pydicom.dcmread(out) + assert str(result.PatientName) == "PAT01^" + # InstitutionName should still be absent (not created) + assert (0x0008, 0x0080) not in result + + +# --------------------------------------------------------------------------- +# Tests: anonymise_ct_set / anonymise_plan +# --------------------------------------------------------------------------- + +class TestAnonymiseCtSet: + def test_anonymises_all_files(self, tmp_path): + patient_dir = tmp_path / "patient_00000001" + _make_test_dicom(patient_dir / "CT_SET", "slice1.DCM") + _make_test_dicom(patient_dir / "CT_SET", "slice2.dcm") # lowercase ext + output_dir = tmp_path / "output" + + anon = DicomAnonymiser(patient_dir, "PAT01", output_dir) + results = anon.anonymise_ct_set() + + assert len(results) == 2 + for p in results: + ds = pydicom.dcmread(p) + assert ds.PatientID == "PAT01" + + +class TestAnonymisePlan: + def test_anonymises_all_files(self, tmp_path): + patient_dir = tmp_path / "patient_00000001" + _make_test_dicom(patient_dir / "DICOM_PLAN", "plan.DCM") + output_dir = tmp_path / "output" + + anon = DicomAnonymiser(patient_dir, "PAT01", output_dir) + results = anon.anonymise_plan() + + assert len(results) == 1 + ds = pydicom.dcmread(results[0]) + assert ds.PatientID == "PAT01" + + +# --------------------------------------------------------------------------- +# Tests: anonymise_all +# --------------------------------------------------------------------------- + +class TestAnonymiseAll: + def test_summary_counts(self, tmp_path): + patient_dir = tmp_path / "patient_00000001" + _make_test_dicom(patient_dir / "CT_SET", "s1.DCM") + _make_test_dicom(patient_dir / "CT_SET", "s2.DCM") + _make_test_dicom(patient_dir / "DICOM_PLAN", "plan.DCM") + output_dir = tmp_path / "output" + + anon = DicomAnonymiser(patient_dir, "PAT03", output_dir) + summary = anon.anonymise_all() + + assert summary == {"ct_count": 2, "plan_count": 1, 
"anon_id": "PAT03"} + + +# --------------------------------------------------------------------------- +# Tests: edge cases +# --------------------------------------------------------------------------- + +class TestPatientNameFormat: + def test_patient_name_with_site(self, tmp_path): + """PatientName set to AnonID^SiteName when site_name provided.""" + patient_dir = tmp_path / "patient_00000001" + dcm_path = _make_test_dicom(patient_dir / "CT_SET", "slice.DCM") + output_dir = tmp_path / "output" + + anon = DicomAnonymiser(patient_dir, "PAT01", output_dir, site_name="Prostate") + out = anon.anonymise_file(dcm_path) + + ds = pydicom.dcmread(out) + assert str(ds.PatientName) == "PAT01^Prostate" + + def test_patient_name_without_site(self, tmp_path): + """PatientName set to AnonID^ when site_name is empty.""" + patient_dir = tmp_path / "patient_00000001" + dcm_path = _make_test_dicom(patient_dir / "CT_SET", "slice.DCM") + output_dir = tmp_path / "output" + + anon = DicomAnonymiser(patient_dir, "PAT01", output_dir) + out = anon.anonymise_file(dcm_path) + + ds = pydicom.dcmread(out) + assert str(ds.PatientName) == "PAT01^" + + +class TestAnonymiseAllDcm: + def test_recursive_discovery(self, tmp_path): + """anonymise_all_dcm finds .dcm files in nested directories.""" + patient_dir = tmp_path / "patient_00000001" + patient_dir.mkdir() + source = tmp_path / "tps_export" + # Create files in various nested dirs + _make_test_dicom(source / "CT_SET", "slice1.DCM") + _make_test_dicom(source / "DICOM_PLAN", "plan.dcm") + _make_test_dicom(source / "deep" / "nested", "struct.dcm") + output_dir = tmp_path / "output" + + anon = DicomAnonymiser(patient_dir, "PAT01", output_dir, site_name="Brain") + results = anon.anonymise_all_dcm(source) + + assert len(results) == 3 + for p in results: + ds = pydicom.dcmread(p) + assert str(ds.PatientName) == "PAT01^Brain" + assert ds.PatientID == "PAT01" + + def test_empty_source_dir(self, tmp_path): + patient_dir = tmp_path / "patient_00000001" + 
patient_dir.mkdir() + source = tmp_path / "empty_source" + source.mkdir() + output_dir = tmp_path / "output" + + anon = DicomAnonymiser(patient_dir, "PAT01", output_dir) + results = anon.anonymise_all_dcm(source) + assert results == [] + + def test_nonexistent_source_dir(self, tmp_path): + patient_dir = tmp_path / "patient_00000001" + patient_dir.mkdir() + + anon = DicomAnonymiser(patient_dir, "PAT01", tmp_path / "output") + results = anon.anonymise_all_dcm(tmp_path / "does_not_exist") + assert results == [] + + +class TestFilenameAnonymised: + def test_parenthesised_name_replaced(self, tmp_path): + """Parenthesised patient name in filename replaced with anon_id.""" + patient_dir = tmp_path / "patient_00000001" + dcm_path = _make_test_dicom( + patient_dir / "DICOM_PLAN", + "DCMRT_Plan(SMITH JOHN).dcm", + patient_name="SMITH JOHN", + ) + output_dir = tmp_path / "output" + + anon = DicomAnonymiser(patient_dir, "PAT01", output_dir) + out = anon.anonymise_file(dcm_path) + + assert out.name == "DCMRT_Plan(PAT01).dcm" + assert "SMITH" not in out.name + + def test_no_parens_unchanged(self, tmp_path): + """Filenames without parentheses are unchanged.""" + patient_dir = tmp_path / "patient_00000001" + dcm_path = _make_test_dicom(patient_dir / "CT_SET", "slice001.DCM") + output_dir = tmp_path / "output" + + anon = DicomAnonymiser(patient_dir, "PAT01", output_dir) + out = anon.anonymise_file(dcm_path) + + assert out.name == "slice001.DCM" + + +class TestAnonymiseFileCustomSourceBase: + def test_relative_path_from_custom_base(self, tmp_path): + """source_base parameter controls relative path computation.""" + patient_dir = tmp_path / "patient_00000001" + patient_dir.mkdir() + # Source is outside the patient dir + tps_root = tmp_path / "tps_export" + dcm_path = _make_test_dicom(tps_root / "sub" / "CT", "slice.DCM") + output_dir = tmp_path / "output" + + anon = DicomAnonymiser(patient_dir, "PAT01", output_dir) + out = anon.anonymise_file(dcm_path, source_base=tps_root) + + # 
Output should mirror the relative path from tps_root + assert out == output_dir / "sub" / "CT" / "slice.DCM" + assert out.exists() + + +class TestAnonymiseFramesXml: + _SAMPLE_XML = """\ + + + + JOHN + SMITH + 12345678 + + + Prostate + Plan for 12345678 prostate treatment + + + 4ee Pelvis Soft S20 + 1.3.46.001 + + +""" + + def test_anonymise_frames_xml(self, tmp_path): + """Patient name, ID replaced; MRN scrubbed from description.""" + patient_dir = tmp_path / "patient_12345678" + patient_dir.mkdir() + xml_file = patient_dir / "_Frames.xml" + xml_file.write_text(self._SAMPLE_XML, encoding="utf-8") + + output_path = tmp_path / "out" / "_Frames.xml" + anon = DicomAnonymiser(patient_dir, "PRIME001", tmp_path / "staging") + result = anon.anonymise_frames_xml(xml_file, output_path) + + assert result == output_path + assert output_path.exists() + + tree = ET.parse(output_path) + root = tree.getroot() + + patient = root.find("Patient") + assert patient.find("FirstName").text == "" or patient.find("FirstName").text is None + assert patient.find("LastName").text == "PRIME001" + assert patient.find("ID").text == "PRIME001" + + desc = root.find("Treatment").find("Description").text + assert "12345678" not in desc + assert "PRIME001" in desc + + # Non-PII tags unchanged + assert root.find("Treatment").find("ID").text == "Prostate" + assert root.find("Image").find("DicomUID").text == "1.3.46.001" + + def test_anonymise_frames_xml_missing_tags(self, tmp_path): + """Gracefully handles XML without Patient element.""" + patient_dir = tmp_path / "patient_00000001" + patient_dir.mkdir() + minimal_xml = """\ + + + Brain + preset + +""" + xml_file = patient_dir / "_Frames.xml" + xml_file.write_text(minimal_xml, encoding="utf-8") + + output_path = tmp_path / "out" / "_Frames.xml" + anon = DicomAnonymiser(patient_dir, "PAT01", tmp_path / "staging") + result = anon.anonymise_frames_xml(xml_file, output_path) + + assert result == output_path + assert output_path.exists() + + tree = 
ET.parse(output_path) + root = tree.getroot() + # Patient element absent — should not crash + assert root.find("Patient") is None + assert root.find("Treatment").find("ID").text == "Brain" + + +class TestEdgeCases: + def test_missing_ct_set_dir(self, tmp_path): + patient_dir = tmp_path / "patient_00000001" + patient_dir.mkdir() + output_dir = tmp_path / "output" + + anon = DicomAnonymiser(patient_dir, "PAT01", output_dir) + assert anon.anonymise_ct_set() == [] + + def test_missing_patient_dir_raises(self, tmp_path): + with pytest.raises(FileNotFoundError): + DicomAnonymiser(tmp_path / "nonexistent", "PAT01", tmp_path / "out") + + def test_output_dir_created(self, tmp_path): + patient_dir = tmp_path / "patient_00000001" + _make_test_dicom(patient_dir / "CT_SET", "s1.DCM") + output_dir = tmp_path / "deeply" / "nested" / "output" + + assert not output_dir.exists() + + anon = DicomAnonymiser(patient_dir, "PAT01", output_dir) + anon.anonymise_ct_set() + + assert output_dir.exists() + assert (output_dir / "CT_SET" / "s1.DCM").exists() + + +# --------------------------------------------------------------------------- +# Tests: anonymise_ini_file +# --------------------------------------------------------------------------- + +class TestAnonymiseIniFile: + def test_replaces_pii_fields(self, tmp_path): + ini = tmp_path / "recon.INI" + ini.write_text( + "PatientID=12345678\n" + "FirstName=JOHN\n" + "LastName=SMITH\n" + "VoxelSize=1.0\n" + "SliceThickness=2.5\n", + encoding="utf-8", + ) + + anonymise_ini_file(ini, "PAT01") + + text = ini.read_text(encoding="utf-8") + assert "PatientID=PAT01" in text + assert "FirstName=" in text and "FirstName=JOHN" not in text + assert "LastName=PAT01" in text + # Non-PII lines unchanged + assert "VoxelSize=1.0" in text + assert "SliceThickness=2.5" in text + + def test_ini_xvi_extension(self, tmp_path): + ini = tmp_path / "recon.INI.XVI" + ini.write_text( + "PatientID=87654321\n" + "FirstName=JANE\n" + "LastName=DOE\n" + "Rows=512\n", + 
encoding="utf-8", + ) + + anonymise_ini_file(ini, "PAT02") + + text = ini.read_text(encoding="utf-8") + assert "PatientID=PAT02" in text + assert "FirstName=JANE" not in text + assert "LastName=PAT02" in text + assert "Rows=512" in text + + +# --------------------------------------------------------------------------- +# Tests: anonymise_centroid_file +# --------------------------------------------------------------------------- + +class TestAnonymiseCentroidFile: + def test_replaces_first_two_lines(self, tmp_path): + f = tmp_path / "centroid.txt" + f.write_text("12345678\nSMITH JOHN\ndata line\n", encoding="utf-8") + + result = anonymise_centroid_file(f, "PAT01") + + lines = result.read_text(encoding="utf-8").splitlines() + assert lines[0] == "PAT01" + assert lines[1] == "PAT01" + assert lines[2] == "data line" + + def test_renames_file_with_mrn(self, tmp_path): + f = tmp_path / "Centroid_12345678.txt" + f.write_text("12345678\nSMITH JOHN\ncoords\n", encoding="utf-8") + + result = anonymise_centroid_file(f, "PAT01") + + assert result.name == "Centroid_PAT01.txt" + assert result.exists() + assert not f.exists() + + def test_no_rename_without_mrn(self, tmp_path): + f = tmp_path / "centroid.txt" + f.write_text("12345678\nSMITH JOHN\ncoords\n", encoding="utf-8") + + result = anonymise_centroid_file(f, "PAT01") + + assert result == f + assert result.exists() + + +# --------------------------------------------------------------------------- +# Tests: anonymise_trajectory_log +# --------------------------------------------------------------------------- + +class TestAnonymiseTrajectoryLog: + def test_replaces_patient_id(self, tmp_path): + f = tmp_path / "MarkerLocations01.txt" + f.write_text( + "path=patient_12345678/data\n" + "ref=patient_12345678\n" + "other line\n", + encoding="utf-8", + ) + + anonymise_trajectory_log(f, "12345678", "PAT01") + + text = f.read_text(encoding="utf-8") + assert "patient_PAT01" in text + assert "patient_12345678" not in text + assert 
"other line" in text + + def test_empty_original_id_no_change(self, tmp_path): + f = tmp_path / "MarkerLocations01.txt" + original = "path=patient_12345678/data\nother\n" + f.write_text(original, encoding="utf-8") + + anonymise_trajectory_log(f, "", "PAT01") + + assert f.read_text(encoding="utf-8") == original + + +# --------------------------------------------------------------------------- +# Tests: anonymise_output_folder +# --------------------------------------------------------------------------- + +_FRAMES_XML_PII = """\ + + + + JOHN + SMITH + 12345678 + + Prostate + +""" + + +def _build_output_tree(tmp_path): + """Build a realistic output folder structure for anonymise_output_folder tests. + + Returns (output_dir, patient_dir, site_name). + """ + site_name = "Prostate" + output_dir = tmp_path / "output" + site_root = output_dir / site_name + + # Patient Images — CBCT with _Frames.xml, INI, .his + cbct = site_root / "Patient Images" / "PAT01" / "FX1" / "CBCT" / "CBCT1" + cbct.mkdir(parents=True) + (cbct / "_Frames.xml").write_text(_FRAMES_XML_PII, encoding="utf-8") + + recon = cbct / "Reconstructed CBCT" + recon.mkdir() + (recon / "recon.INI").write_text( + "PatientID=12345678\nFirstName=JOHN\nLastName=SMITH\nRows=512\n", + encoding="utf-8", + ) + + proj = cbct / "CBCT Projections" / "IPS" + proj.mkdir(parents=True) + (proj / "00001.his").write_bytes(b"\x00\x01\x02\x03") + + # Patient Files — centroid + pf = site_root / "Patient Files" / "PAT01" + pf.mkdir(parents=True) + (pf / "Centroid_12345678.txt").write_text( + "12345678\nSMITH JOHN\ncoords\n", encoding="utf-8" + ) + + # KIM-KV — trajectory log + kim = site_root / "KIM-KV" / "img_session" + kim.mkdir(parents=True) + (kim / "MarkerLocations01.txt").write_text( + "path=patient_12345678/data\n", encoding="utf-8" + ) + + # Source patient dir (must exist for DicomAnonymiser) + patient_dir = tmp_path / "patient_12345678" + patient_dir.mkdir() + + return output_dir, patient_dir, site_name + + +class 
TestAnonymiseOutputFolder: + def test_anonymises_all_file_types(self, tmp_path): + output_dir, patient_dir, site_name = _build_output_tree(tmp_path) + + counts = anonymise_output_folder( + output_dir=output_dir, + anon_id="PAT01", + site_name=site_name, + patient_dir=patient_dir, + ) + + assert counts["xml"] == 1 + assert counts["ini"] == 1 + assert counts["centroid"] == 1 + assert counts["trajectory"] == 1 + assert counts["errors"] == 0 + + site_root = output_dir / site_name + + # _Frames.xml anonymised + xml_path = ( + site_root / "Patient Images" / "PAT01" / "FX1" / "CBCT" / "CBCT1" + / "_Frames.xml" + ) + xml_text = xml_path.read_text(encoding="utf-8") + assert "12345678" not in xml_text + assert "JOHN" not in xml_text + + # INI anonymised + ini_path = ( + site_root / "Patient Images" / "PAT01" / "FX1" / "CBCT" / "CBCT1" + / "Reconstructed CBCT" / "recon.INI" + ) + ini_text = ini_path.read_text(encoding="utf-8") + assert "PatientID=PAT01" in ini_text + assert "12345678" not in ini_text + + # Centroid anonymised and renamed + centroid_dir = site_root / "Patient Files" / "PAT01" + centroid_files = list(centroid_dir.glob("Centroid_*.txt")) + assert len(centroid_files) == 1 + assert "PAT01" in centroid_files[0].name + centroid_text = centroid_files[0].read_text(encoding="utf-8") + assert centroid_text.splitlines()[0] == "PAT01" + + # Trajectory log anonymised + traj_path = ( + site_root / "KIM-KV" / "img_session" / "MarkerLocations01.txt" + ) + traj_text = traj_path.read_text(encoding="utf-8") + assert "patient_PAT01" in traj_text + assert "patient_12345678" not in traj_text + + # .his file untouched + his_path = ( + site_root / "Patient Images" / "PAT01" / "FX1" / "CBCT" / "CBCT1" + / "CBCT Projections" / "IPS" / "00001.his" + ) + assert his_path.read_bytes() == b"\x00\x01\x02\x03" + + def test_tps_import(self, tmp_path): + output_dir, patient_dir, site_name = _build_output_tree(tmp_path) + + # Create a TPS export with a DICOM CT file + tps = tmp_path / 
"tps_export" + _make_test_dicom(tps / "DICOM CT Images", "ct_slice.dcm") + + counts = anonymise_output_folder( + output_dir=output_dir, + anon_id="PAT01", + site_name=site_name, + patient_dir=patient_dir, + tps_path=tps, + ) + + assert counts["tps_imported"] == 1 + + ct_dir = output_dir / site_name / "Patient Plans" / "PAT01" / "CT" + dcm_files = list(ct_dir.rglob("*.dcm")) + list(ct_dir.rglob("*.DCM")) + assert len(dcm_files) >= 1 + ds = pydicom.dcmread(dcm_files[0]) + assert ds.PatientID == "PAT01" + + def test_progress_callback(self, tmp_path): + output_dir, patient_dir, site_name = _build_output_tree(tmp_path) + + calls = [] + + def on_progress(current, total, filename): + calls.append((current, total, filename)) + + anonymise_output_folder( + output_dir=output_dir, + anon_id="PAT01", + site_name=site_name, + patient_dir=patient_dir, + progress_callback=on_progress, + ) + + assert len(calls) > 0 + # Final call should have current == total + assert calls[-1][0] == calls[-1][1] diff --git a/tests/test_folder_sort.py b/tests/test_folder_sort.py new file mode 100644 index 0000000..f241709 --- /dev/null +++ b/tests/test_folder_sort.py @@ -0,0 +1,839 @@ +"""Tests for learn_upload.folder_sort — session discovery, fraction assignment, file copying.""" + +import textwrap +import xml.etree.ElementTree as ET +from datetime import datetime +from pathlib import Path +from unittest.mock import patch + +import pydicom +from pydicom.dataset import FileDataset +from pydicom.uid import ExplicitVRLittleEndian, generate_uid + +from learn_upload.folder_sort import ( + CBCTSession, + LearnFolderMapper, + classify_acquisition, +) + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_frames_xml( + treatment_id: str = "Prostate", + preset_name: str = "4ee Pelvis Soft S20 179-181", + dicom_uid: str = "1.3.46.423632.33783920233217242713.500", + kv: str = 
"120.0", + ma: str = "25.5", + patient_id: str = "", + patient_first_name: str = "", + patient_last_name: str = "", +) -> str: + """Generate a _Frames.xml content string.""" + patient_block = "" + if patient_id or patient_first_name or patient_last_name: + patient_block = ( + " \n" + f" {patient_first_name}\n" + f" {patient_last_name}\n" + f" {patient_id}\n" + " \n" + ) + return ( + '\n' + "\n" + f"{patient_block}" + " \n" + f" {treatment_id}\n" + " \n" + " \n" + f" {preset_name}\n" + f" {dicom_uid}\n" + f" {kv}\n" + f" {ma}\n" + " \n" + " \n" + ' \n' + " \n" + "\n" + ) + + +def _make_xvi_session( + tmp_path: Path, + img_name: str, + preset_name: str = "4ee Pelvis Soft S20 179-181", + treatment_id: str = "Prostate", + dicom_uid: str = "1.3.46.423632.33783920233217242713.500", + scan_uid: str = "1.3.46.423632.33783920233217242713.224.2023-03-21165402768", + num_his: int = 2, + with_reconstruction: bool = True, + with_rps: bool = False, +) -> Path: + """Create a synthetic img_* directory with _Frames.xml and optional files.""" + patient_dir = tmp_path / "patient_12345" + images_dir = patient_dir / "IMAGES" + img_dir = images_dir / img_name + img_dir.mkdir(parents=True, exist_ok=True) + + # _Frames.xml + xml_content = _make_frames_xml( + treatment_id=treatment_id, + preset_name=preset_name, + dicom_uid=dicom_uid, + ) + (img_dir / "_Frames.xml").write_text(xml_content, encoding="utf-8") + + # .his files + for i in range(num_his): + (img_dir / f"frame_{i:04d}.his").write_bytes(b"\x00" * 100) + + # Reconstruction directory + if with_reconstruction: + recon_dir = img_dir / "Reconstruction" + recon_dir.mkdir() + ini_content = textwrap.dedent(f"""\ + [IDENTIFICATION] + PatientID=12345 + TreatmentID={treatment_id} + ScanUID={scan_uid} + + [RECONSTRUCTION] + TubeKV=120.0 + TubeMA=25.5 + """) + (recon_dir / "recon.INI").write_text(ini_content, encoding="utf-8") + (recon_dir / "volume.SCAN").write_bytes(b"\x00" * 200) + (recon_dir / 
"volume.SCAN.MACHINEORIENTATION").write_bytes(b"\x00" * 50) + + return patient_dir + + +def _make_modality_dcm(directory: Path, filename: str, modality: str) -> Path: + """Create a minimal DICOM file with a specific Modality tag.""" + directory.mkdir(parents=True, exist_ok=True) + filepath = directory / filename + + file_meta = pydicom.Dataset() + file_meta.MediaStorageSOPClassUID = "1.2.840.10008.5.1.4.1.1.2" + file_meta.MediaStorageSOPInstanceUID = generate_uid() + file_meta.TransferSyntaxUID = ExplicitVRLittleEndian + + ds = FileDataset(str(filepath), {}, file_meta=file_meta, preamble=b"\x00" * 128) + ds.is_little_endian = True + ds.is_implicit_VR = False + ds.Modality = modality + ds.PatientName = "Test^Patient" + ds.PatientID = "12345" + ds.SOPInstanceUID = file_meta.MediaStorageSOPInstanceUID + ds.SOPClassUID = file_meta.MediaStorageSOPClassUID + + ds.save_as(filepath) + return filepath + + +# --------------------------------------------------------------------------- +# classify_dicom_files +# --------------------------------------------------------------------------- + +class TestClassifyDicomFiles: + def test_classifies_by_modality(self, tmp_path): + """CT, RTPLAN, RTSTRUCT, RTDOSE correctly classified.""" + source = tmp_path / "tps_export" + _make_modality_dcm(source / "CT_SET", "slice1.dcm", "CT") + _make_modality_dcm(source / "CT_SET", "slice2.dcm", "CT") + _make_modality_dcm(source / "DICOM_PLAN", "plan.dcm", "RTPLAN") + _make_modality_dcm(source / "Structures", "struct.dcm", "RTSTRUCT") + _make_modality_dcm(source / "Dose", "dose.dcm", "RTDOSE") + + result = LearnFolderMapper.classify_dicom_files(source) + + assert len(result["ct"]) == 2 + assert len(result["plan"]) == 1 + assert len(result["structures"]) == 1 + assert len(result["dose"]) == 1 + + def test_unknown_modality_excluded(self, tmp_path): + """Unrecognised modality logged, not in results.""" + source = tmp_path / "tps_export" + _make_modality_dcm(source, "mystery.dcm", "MR") + 
_make_modality_dcm(source, "ct.dcm", "CT") + + result = LearnFolderMapper.classify_dicom_files(source) + + assert len(result["ct"]) == 1 + # MR should not appear in any category + all_files = result["ct"] + result["plan"] + result["structures"] + result["dose"] + assert len(all_files) == 1 + + def test_empty_dir(self, tmp_path): + source = tmp_path / "empty" + source.mkdir() + result = LearnFolderMapper.classify_dicom_files(source) + assert all(len(v) == 0 for v in result.values()) + + def test_nonexistent_dir(self, tmp_path): + result = LearnFolderMapper.classify_dicom_files(tmp_path / "nope") + assert all(len(v) == 0 for v in result.values()) + + def test_recursive_discovery(self, tmp_path): + """Files in deeply nested directories are found.""" + source = tmp_path / "export" + _make_modality_dcm(source / "a" / "b" / "c", "deep.dcm", "RTDOSE") + + result = LearnFolderMapper.classify_dicom_files(source) + assert len(result["dose"]) == 1 + + +# --------------------------------------------------------------------------- +# classify_acquisition +# --------------------------------------------------------------------------- + +class TestClassifyAcquisition: + def test_cbct(self): + assert classify_acquisition("4ee Pelvis Soft S20 179-181") == "cbct" + + def test_kim_learning(self): + assert classify_acquisition("12aa KIM S20 R 34-181") == "kim_learning" + + def test_motionview(self): + assert classify_acquisition("13a KIM S20 MotionView") == "motionview" + + def test_case_insensitive(self): + assert classify_acquisition("KIM motionview preset") == "motionview" + assert classify_acquisition("kim learning preset") == "kim_learning" + + +# --------------------------------------------------------------------------- +# discover_sessions +# --------------------------------------------------------------------------- + +class TestDiscoverSessions: + @patch("learn_upload.folder_sort.extract_ini_from_rps", return_value=None) + def test_discover_sessions_basic(self, mock_rps, 
tmp_path): + patient_dir = _make_xvi_session( + tmp_path, "img_001", + scan_uid="1.3.46.423632.12345.2023-03-21100000000", + dicom_uid="1.3.46.001", + ) + # Add second session + img2 = patient_dir / "IMAGES" / "img_002" + img2.mkdir() + xml2 = _make_frames_xml( + dicom_uid="1.3.46.002", + preset_name="4ee Pelvis Soft S20 179-181", + ) + (img2 / "_Frames.xml").write_text(xml2, encoding="utf-8") + recon2 = img2 / "Reconstruction" + recon2.mkdir() + ini2 = "[IDENTIFICATION]\nScanUID=1.3.46.423632.12345.2023-03-22110000000\n" + (recon2 / "recon.INI").write_text(ini2, encoding="utf-8") + + mapper = LearnFolderMapper(patient_dir, "PAT01", "Prostate", tmp_path / "out") + sessions = mapper.discover_sessions() + + assert len(sessions) == 2 + assert sessions[0].scan_datetime < sessions[1].scan_datetime + + @patch("learn_upload.folder_sort.extract_ini_from_rps", return_value=None) + def test_discover_sessions_skips_missing_xml(self, mock_rps, tmp_path): + patient_dir = _make_xvi_session(tmp_path, "img_001") + # Create a dir with no _Frames.xml + no_xml_dir = patient_dir / "IMAGES" / "img_999" + no_xml_dir.mkdir() + + mapper = LearnFolderMapper(patient_dir, "PAT01", "Prostate", tmp_path / "out") + sessions = mapper.discover_sessions() + assert len(sessions) == 1 + + @patch("learn_upload.folder_sort.extract_ini_from_rps", return_value=None) + def test_discover_motionview_session(self, mock_rps, tmp_path): + patient_dir = _make_xvi_session( + tmp_path, "img_mv", + preset_name="13a KIM S20 MotionView", + with_reconstruction=False, + ) + + mapper = LearnFolderMapper(patient_dir, "PAT01", "Prostate", tmp_path / "out") + sessions = mapper.discover_sessions() + + assert len(sessions) == 1 + assert sessions[0].session_type == "motionview" + assert sessions[0].scan_datetime is None + + def test_missing_images_dir(self, tmp_path): + patient_dir = tmp_path / "patient_empty" + patient_dir.mkdir() + + mapper = LearnFolderMapper(patient_dir, "PAT01", "Prostate", tmp_path / "out") + sessions 
= mapper.discover_sessions() + assert sessions == [] + + +# --------------------------------------------------------------------------- +# _match_motionview_dates +# --------------------------------------------------------------------------- + +class TestMatchMotionviewDates: + def test_match_motionview_dates(self, tmp_path): + dated_dt = datetime(2023, 3, 21, 10, 0, 0) + dated = CBCTSession( + img_dir=tmp_path / "img_001", + dicom_uid="1.3.46.423632.ABCDEF.001", + acquisition_preset="4ee Pelvis", + session_type="cbct", + treatment_id="Prostate", + scan_datetime=dated_dt, + ) + undated = CBCTSession( + img_dir=tmp_path / "img_mv", + dicom_uid="1.3.46.423632.ABCDEF.999", + acquisition_preset="13a KIM MotionView", + session_type="motionview", + treatment_id="Prostate", + scan_datetime=None, + ) + + mapper = LearnFolderMapper(tmp_path, "PAT01", "Prostate", tmp_path / "out") + mapper._match_motionview_dates([dated], [undated]) + + assert undated.scan_datetime == dated_dt + + +# --------------------------------------------------------------------------- +# assign_fractions +# --------------------------------------------------------------------------- + +class TestAssignFractions: + def _make_session(self, tmp_path, dt, session_type="cbct", name="img_001"): + return CBCTSession( + img_dir=tmp_path / name, + dicom_uid=f"uid_{name}", + acquisition_preset="preset", + session_type=session_type, + treatment_id="Prostate", + scan_datetime=dt, + ) + + def test_assign_fractions_chronological(self, tmp_path): + s1 = self._make_session(tmp_path, datetime(2023, 3, 21, 10, 0), name="img_001") + s2 = self._make_session(tmp_path, datetime(2023, 3, 22, 10, 0), name="img_002") + s3 = self._make_session(tmp_path, datetime(2023, 3, 23, 10, 0), name="img_003") + + mapper = LearnFolderMapper(tmp_path, "PAT01", "Prostate", tmp_path / "out") + fractions = mapper.assign_fractions([s1, s2, s3]) + + assert list(fractions.keys()) == ["FX1", "FX2", "FX3"] + assert fractions["FX1"] == [s1] + assert 
fractions["FX2"] == [s2] + assert fractions["FX3"] == [s3] + + def test_assign_fractions_same_day(self, tmp_path): + s1 = self._make_session(tmp_path, datetime(2023, 3, 21, 10, 0), name="img_001") + s2 = self._make_session(tmp_path, datetime(2023, 3, 21, 14, 0), name="img_002") + + mapper = LearnFolderMapper(tmp_path, "PAT01", "Prostate", tmp_path / "out") + fractions = mapper.assign_fractions([s1, s2]) + + assert list(fractions.keys()) == ["FX1"] + assert len(fractions["FX1"]) == 2 + + def test_assign_fractions_with_motionview(self, tmp_path): + s1 = self._make_session(tmp_path, datetime(2023, 3, 21, 10, 0), name="img_001") + mv = self._make_session( + tmp_path, datetime(2023, 3, 21, 10, 5), + session_type="motionview", name="img_mv", + ) + + mapper = LearnFolderMapper(tmp_path, "PAT01", "Prostate", tmp_path / "out") + fractions = mapper.assign_fractions([s1, mv]) + + assert "FX1" in fractions + assert len(fractions["FX1"]) == 2 + types = {s.session_type for s in fractions["FX1"]} + assert types == {"cbct", "motionview"} + + +# --------------------------------------------------------------------------- +# create_learn_structure +# --------------------------------------------------------------------------- + +class TestCreateLearnStructure: + def test_create_learn_structure(self, tmp_path): + s1 = CBCTSession( + img_dir=tmp_path / "img_001", + dicom_uid="uid1", + acquisition_preset="4ee Pelvis", + session_type="cbct", + treatment_id="Prostate", + scan_datetime=datetime(2023, 3, 21, 10, 0), + ) + s2 = CBCTSession( + img_dir=tmp_path / "img_002", + dicom_uid="uid2", + acquisition_preset="4ee Pelvis", + session_type="cbct", + treatment_id="Prostate", + scan_datetime=datetime(2023, 3, 21, 14, 0), + ) + + fraction_map = {"FX1": [s1, s2]} + mapper = LearnFolderMapper(tmp_path, "PAT01", "Prostate", tmp_path / "out") + site_root = mapper.create_learn_structure(fraction_map) + + # Verify key directories exist + assert (site_root / "Patient Files" / "PAT01").is_dir() + 
assert (site_root / "Patient Plans" / "PAT01" / "CT").is_dir() + assert (site_root / "Patient Plans" / "PAT01" / "Plan").is_dir() + assert (site_root / "Patient Plans" / "PAT01" / "Dose").is_dir() + assert (site_root / "Patient Plans" / "PAT01" / "Structure Set").is_dir() + assert (site_root / "Ground Truth" / "PAT01").is_dir() + + # Verify fraction structure + fx1 = site_root / "Patient Images" / "PAT01" / "FX1" + assert (fx1 / "CBCT" / "CBCT1" / "CBCT Projections" / "IPS").is_dir() + assert (fx1 / "CBCT" / "CBCT1" / "CBCT Projections" / "CDOG").is_dir() + assert (fx1 / "CBCT" / "CBCT1" / "Reconstructed CBCT").is_dir() + assert (fx1 / "CBCT" / "CBCT1" / "Registration file").is_dir() + assert (fx1 / "CBCT" / "CBCT2" / "CBCT Projections" / "IPS").is_dir() + assert (fx1 / "KIM-KV").is_dir() + + +# --------------------------------------------------------------------------- +# copy_cbct_files +# --------------------------------------------------------------------------- + +class TestCopyCbctFiles: + def test_copy_cbct_files(self, tmp_path): + # Set up source session + patient_dir = _make_xvi_session(tmp_path, "img_001", num_his=3) + img_dir = patient_dir / "IMAGES" / "img_001" + + session = CBCTSession( + img_dir=img_dir, + dicom_uid="uid1", + acquisition_preset="4ee Pelvis", + session_type="cbct", + treatment_id="Prostate", + has_rps=False, + rps_path=None, + ) + + # Set up destination + cbct_path = tmp_path / "dest" / "CBCT1" + cbct_path.mkdir(parents=True) + + mapper = LearnFolderMapper(patient_dir, "PAT01", "Prostate", tmp_path / "out") + counts = mapper.copy_cbct_files(session, cbct_path) + + assert counts["his"] == 3 + assert counts["scan"] == 2 # .SCAN + .SCAN.MACHINEORIENTATION + assert counts["rps"] == 0 + + # Verify files exist + ips_files = list((cbct_path / "CBCT Projections" / "IPS").glob("*.his")) + assert len(ips_files) == 3 + + def test_ini_files_copied(self, tmp_path): + """INI and INI.XVI files from Reconstruction/ copied to Reconstructed CBCT/.""" + 
patient_dir = _make_xvi_session(tmp_path, "img_ini", num_his=1) + img_dir = patient_dir / "IMAGES" / "img_ini" + recon_dir = img_dir / "Reconstruction" + # _make_xvi_session already creates recon.INI — add an .INI.XVI too + (recon_dir / "recon.INI.XVI").write_text("xvi config", encoding="utf-8") + + session = CBCTSession( + img_dir=img_dir, + dicom_uid="uid_ini", + acquisition_preset="4ee Pelvis", + session_type="cbct", + treatment_id="Prostate", + has_rps=False, + rps_path=None, + ) + + cbct_path = tmp_path / "dest" / "CBCT1" + cbct_path.mkdir(parents=True) + + mapper = LearnFolderMapper(patient_dir, "PAT01", "Prostate", tmp_path / "out") + counts = mapper.copy_cbct_files(session, cbct_path) + + assert counts["ini"] == 2 # .INI + .INI.XVI + recon_dest = cbct_path / "Reconstructed CBCT" + assert (recon_dest / "recon.INI").exists() + assert (recon_dest / "recon.INI.XVI").exists() + + +# --------------------------------------------------------------------------- +# copy_motionview_files +# --------------------------------------------------------------------------- + +class TestCopyMotionviewFiles: + def test_copy_motionview_files(self, tmp_path): + # Set up MV source + patient_dir = _make_xvi_session( + tmp_path, "img_mv", + preset_name="13a KIM S20 MotionView", + with_reconstruction=False, + num_his=5, + ) + img_dir = patient_dir / "IMAGES" / "img_mv" + + session = CBCTSession( + img_dir=img_dir, + dicom_uid="uid_mv", + acquisition_preset="13a KIM S20 MotionView", + session_type="motionview", + treatment_id="Prostate", + ) + + fx_path = tmp_path / "dest" / "FX0" + fx_path.mkdir(parents=True) + + mapper = LearnFolderMapper(patient_dir, "PAT01", "Prostate", tmp_path / "out") + counts = mapper.copy_motionview_files(session, fx_path) + + assert counts["his"] == 5 + assert counts["frames_xml"] == 1 # _Frames.xml from _make_xvi_session + dest_files = list((fx_path / "KIM-KV" / "img_mv").glob("*.his")) + assert len(dest_files) == 5 + + +# 
--------------------------------------------------------------------------- +# copy_anonymised_plans +# --------------------------------------------------------------------------- + +class TestCopyAnonymisedPlans: + def test_copy_anonymised_plans(self, tmp_path): + # Set up anonymised source dirs + ct_dir = tmp_path / "anon" / "CT_SET" + ct_dir.mkdir(parents=True) + for i in range(3): + (ct_dir / f"ct_{i}.dcm").write_bytes(b"\x00" * 50) + + plan_dir = tmp_path / "anon" / "DICOM_PLAN" + plan_dir.mkdir(parents=True) + (plan_dir / "plan.dcm").write_bytes(b"\x00" * 50) + + out = tmp_path / "out" + mapper = LearnFolderMapper(tmp_path, "PAT01", "Prostate", out) + counts = mapper.copy_anonymised_plans(ct_dir, plan_dir) + + assert counts["ct_count"] == 3 + assert counts["plan_count"] == 1 + assert counts["structures_count"] == 0 + assert counts["dose_count"] == 0 + + # Verify destination + assert (out / "Prostate" / "Patient Plans" / "PAT01" / "CT" / "ct_0.dcm").exists() + assert (out / "Prostate" / "Patient Plans" / "PAT01" / "Plan" / "plan.dcm").exists() + + def test_copy_all_four_categories(self, tmp_path): + ct_dir = tmp_path / "anon" / "ct" + ct_dir.mkdir(parents=True) + (ct_dir / "ct.dcm").write_bytes(b"\x00" * 50) + + plan_dir = tmp_path / "anon" / "plan" + plan_dir.mkdir(parents=True) + (plan_dir / "plan.dcm").write_bytes(b"\x00" * 50) + + struct_dir = tmp_path / "anon" / "struct" + struct_dir.mkdir(parents=True) + (struct_dir / "struct.dcm").write_bytes(b"\x00" * 50) + + dose_dir = tmp_path / "anon" / "dose" + dose_dir.mkdir(parents=True) + (dose_dir / "dose.dcm").write_bytes(b"\x00" * 50) + (dose_dir / "dose2.dcm").write_bytes(b"\x00" * 50) + + out = tmp_path / "out" + mapper = LearnFolderMapper(tmp_path, "PAT01", "Prostate", out) + counts = mapper.copy_anonymised_plans( + ct_dir, plan_dir, struct_dir, dose_dir + ) + + assert counts == { + "ct_count": 1, + "plan_count": 1, + "structures_count": 1, + "dose_count": 2, + } + + plans_root = out / "Prostate" / 
"Patient Plans" / "PAT01" + assert (plans_root / "CT" / "ct.dcm").exists() + assert (plans_root / "Plan" / "plan.dcm").exists() + assert (plans_root / "Structure Set" / "struct.dcm").exists() + assert (plans_root / "Dose" / "dose.dcm").exists() + assert (plans_root / "Dose" / "dose2.dcm").exists() + + def test_none_dirs_skipped(self, tmp_path): + """Passing None for all dirs returns all-zero counts.""" + out = tmp_path / "out" + mapper = LearnFolderMapper(tmp_path, "PAT01", "Prostate", out) + counts = mapper.copy_anonymised_plans() + + assert counts == { + "ct_count": 0, + "plan_count": 0, + "structures_count": 0, + "dose_count": 0, + } + + +# --------------------------------------------------------------------------- +# execute +# --------------------------------------------------------------------------- + +class TestExecute: + @patch("learn_upload.folder_sort.extract_ini_from_rps", return_value=None) + def test_execute_dry_run(self, mock_rps, tmp_path): + patient_dir = _make_xvi_session( + tmp_path, "img_001", + scan_uid="1.3.46.423632.12345.2023-03-21100000000", + ) + + out = tmp_path / "out" + mapper = LearnFolderMapper(patient_dir, "PAT01", "Prostate", out) + summary = mapper.execute(dry_run=True) + + assert summary["dry_run"] is True + assert summary["sessions"] == 1 + assert summary["fractions"] == 1 + # Dirs should exist + assert (out / "Prostate" / "Patient Images" / "PAT01" / "FX1" / "CBCT" / "CBCT1").is_dir() + # No files should be copied + ips = out / "Prostate" / "Patient Images" / "PAT01" / "FX1" / "CBCT" / "CBCT1" / "CBCT Projections" / "IPS" + assert list(ips.glob("*.his")) == [] + + @patch("learn_upload.folder_sort.extract_ini_from_rps", return_value=None) + def test_execute_full(self, mock_rps, tmp_path): + patient_dir = _make_xvi_session( + tmp_path, "img_001", + scan_uid="1.3.46.423632.12345.2023-03-21100000000", + num_his=4, + ) + + out = tmp_path / "out" + mapper = LearnFolderMapper(patient_dir, "PAT01", "Prostate", out) + summary = 
mapper.execute(dry_run=False) + + assert summary["dry_run"] is False + assert summary["sessions"] == 1 + assert summary["fractions"] == 1 + assert summary["files_copied"]["his"] == 4 + assert summary["files_copied"]["scan"] == 2 # .SCAN + .SCAN.MACHINEORIENTATION + + # Verify files actually copied + ips = out / "Prostate" / "Patient Images" / "PAT01" / "FX1" / "CBCT" / "CBCT1" / "CBCT Projections" / "IPS" + assert len(list(ips.glob("*.his"))) == 4 + + +# --------------------------------------------------------------------------- +# _Frames.xml copying +# --------------------------------------------------------------------------- + +class TestFramesXmlCopied: + def test_frames_xml_copied_with_cbct(self, tmp_path): + """_Frames.xml raw-copied to IPS/ alongside .his files (no anonymisation).""" + patient_dir = tmp_path / "patient_12345678" + images_dir = patient_dir / "IMAGES" / "img_001" + images_dir.mkdir(parents=True) + + xml_content = _make_frames_xml( + patient_id="12345678", + patient_first_name="JOHN", + patient_last_name="SMITH", + ) + (images_dir / "_Frames.xml").write_text(xml_content, encoding="utf-8") + (images_dir / "frame_0000.his").write_bytes(b"\x00" * 100) + + # Reconstruction for datetime extraction + recon = images_dir / "Reconstruction" + recon.mkdir() + ini = "[IDENTIFICATION]\nScanUID=1.3.46.423632.12345.2023-03-21100000000\n" + (recon / "recon.INI").write_text(ini, encoding="utf-8") + + session = CBCTSession( + img_dir=images_dir, + dicom_uid="uid1", + acquisition_preset="4ee Pelvis", + session_type="cbct", + treatment_id="Prostate", + ) + + cbct_path = tmp_path / "dest" / "CBCT1" + cbct_path.mkdir(parents=True) + + mapper = LearnFolderMapper(patient_dir, "PRIME001", "Prostate", tmp_path / "out") + counts = mapper.copy_cbct_files(session, cbct_path) + + assert counts["frames_xml"] == 1 + output_xml = cbct_path / "CBCT Projections" / "IPS" / "_Frames.xml" + assert output_xml.exists() + + # Raw copy — original PII should still be present 
(anonymise step handles it) + tree = ET.parse(output_xml) + root = tree.getroot() + patient = root.find("Patient") + assert patient.find("ID").text == "12345678" + assert patient.find("FirstName").text == "JOHN" + + def test_frames_xml_copied_with_motionview(self, tmp_path): + """_Frames.xml raw-copied to KIM-KV/{img_dir}/ (no anonymisation).""" + patient_dir = tmp_path / "patient_12345678" + images_dir = patient_dir / "IMAGES" / "img_mv01" + images_dir.mkdir(parents=True) + + xml_content = _make_frames_xml( + preset_name="13a KIM S20 MotionView", + patient_id="12345678", + patient_first_name="JOHN", + patient_last_name="SMITH", + ) + (images_dir / "_Frames.xml").write_text(xml_content, encoding="utf-8") + for i in range(3): + (images_dir / f"frame_{i:04d}.his").write_bytes(b"\x00" * 100) + + session = CBCTSession( + img_dir=images_dir, + dicom_uid="uid_mv", + acquisition_preset="13a KIM S20 MotionView", + session_type="motionview", + treatment_id="Prostate", + ) + + fx_path = tmp_path / "dest" / "FX0" + fx_path.mkdir(parents=True) + + mapper = LearnFolderMapper(patient_dir, "PRIME001", "Prostate", tmp_path / "out") + counts = mapper.copy_motionview_files(session, fx_path) + + assert counts["his"] == 3 + assert counts["frames_xml"] == 1 + + output_xml = fx_path / "KIM-KV" / "img_mv01" / "_Frames.xml" + assert output_xml.exists() + + # Raw copy — original PII still present + tree = ET.parse(output_xml) + root = tree.getroot() + patient = root.find("Patient") + assert patient.find("ID").text == "12345678" + assert "12345678" in output_xml.read_text(encoding="utf-8") + + +# --------------------------------------------------------------------------- +# copy_centroid_file +# --------------------------------------------------------------------------- + +class TestCopyCentroidFile: + def test_copy_centroid_file(self, tmp_path): + """Centroid file raw-copied to Patient Files/ (no anonymisation).""" + patient_dir = tmp_path / "patient_12345678" + patient_dir.mkdir() + + 
centroid = tmp_path / "Centroid_12345678.txt" + centroid.write_text("12345678\nSMITH JOHN\n1.23 4.56 7.89\n", encoding="utf-8") + + mapper = LearnFolderMapper(patient_dir, "PRIME001", "Prostate", tmp_path / "out") + result = mapper.copy_centroid_file(centroid) + + assert result.exists() + # Raw copy — original filename preserved + assert result.name == "Centroid_12345678.txt" + assert result.parent == tmp_path / "out" / "Prostate" / "Patient Files" / "PRIME001" + + text = result.read_text(encoding="utf-8") + lines = text.splitlines() + # Raw copy — PII still present + assert lines[0] == "12345678" + assert lines[1] == "SMITH JOHN" + assert lines[2] == "1.23 4.56 7.89" + + +# --------------------------------------------------------------------------- +# copy_trajectory_logs +# --------------------------------------------------------------------------- + +class TestCopyTrajectoryLogs: + def _setup_trajectory(self, tmp_path): + """Create a patient dir and trajectory base dir with FX01/FX02.""" + patient_dir = tmp_path / "patient_12345678" + patient_dir.mkdir() + + traj_base = tmp_path / "trajectory" + for fx in ("FX01", "FX02"): + fx_dir = traj_base / fx + fx_dir.mkdir(parents=True) + # MarkerLocations with PII path + ml_text = ( + "Marker 1\n" + f"C:\\data\\patient_12345678\\trajectory\\{fx}\\markers.dat\n" + "1.0 2.0 3.0\n" + ) + (fx_dir / "MarkerLocations.txt").write_text(ml_text, encoding="utf-8") + (fx_dir / "MarkerLocationsGA.txt").write_text(ml_text, encoding="utf-8") + # Non-PII files + (fx_dir / "couchShifts.txt").write_text("0.1 0.2 0.3\n", encoding="utf-8") + (fx_dir / "covOutput.txt").write_text("cov data\n", encoding="utf-8") + (fx_dir / "Rotation.txt").write_text("rot data\n", encoding="utf-8") + + return patient_dir, traj_base + + def test_copy_trajectory_logs(self, tmp_path): + """Files placed in correct Trajectory Logs structure.""" + patient_dir, traj_base = self._setup_trajectory(tmp_path) + out = tmp_path / "out" + + mapper = 
LearnFolderMapper(patient_dir, "PRIME001", "Prostate", out) + counts = mapper.copy_trajectory_logs(traj_base) + + assert counts["fx_count"] == 2 + # 5 files per FX (2 MarkerLocations + 3 plain) × 2 FXs = 10 + assert counts["files_copied"] == 10 + + for fx in ("FX01", "FX02"): + traj_dest = out / "Prostate" / "Trajectory Logs" / "PRIME001" / fx / "Trajectory Logs" + assert traj_dest.is_dir() + assert (traj_dest / "MarkerLocations.txt").exists() + assert (traj_dest / "MarkerLocationsGA.txt").exists() + assert (traj_dest / "couchShifts.txt").exists() + assert (traj_dest / "covOutput.txt").exists() + assert (traj_dest / "Rotation.txt").exists() + + # Treatment Records sibling created + treat_dest = out / "Prostate" / "Trajectory Logs" / "PRIME001" / fx / "Treatment Records" + assert treat_dest.is_dir() + + def test_trajectory_marker_locations_raw_copy(self, tmp_path): + """MarkerLocations copied raw — PII still present (anonymise step handles it).""" + patient_dir, traj_base = self._setup_trajectory(tmp_path) + out = tmp_path / "out" + + mapper = LearnFolderMapper(patient_dir, "PRIME001", "Prostate", out) + mapper.copy_trajectory_logs(traj_base) + + ml_path = ( + out / "Prostate" / "Trajectory Logs" / "PRIME001" + / "FX01" / "Trajectory Logs" / "MarkerLocations.txt" + ) + text = ml_path.read_text(encoding="utf-8") + # Raw copy — original MRN still present + assert "patient_12345678" in text + + def test_trajectory_no_pii_files_unchanged(self, tmp_path): + """couchShifts, covOutput, Rotation copied byte-for-byte.""" + patient_dir, traj_base = self._setup_trajectory(tmp_path) + out = tmp_path / "out" + + mapper = LearnFolderMapper(patient_dir, "PRIME001", "Prostate", out) + mapper.copy_trajectory_logs(traj_base) + + for fname in ("couchShifts.txt", "covOutput.txt", "Rotation.txt"): + src = traj_base / "FX01" / fname + dest = ( + out / "Prostate" / "Trajectory Logs" / "PRIME001" + / "FX01" / "Trajectory Logs" / fname + ) + assert src.read_text() == dest.read_text() diff 
--git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..4a249c4 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,217 @@ +"""Tests for learn_upload.utils — INI parsing, XML parsing, ScanUID datetime.""" + +import textwrap +from datetime import datetime + +from learn_upload.utils import ( + parse_couch_shifts, + parse_frames_xml, + parse_scan_datetime, + parse_xvi_ini, +) + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +SAMPLE_INI = textwrap.dedent("""\ + [XVI] + SomeUnrelatedKey=ignored + Visibility=1 + + [IDENTIFICATION] + PatientID=15002197 + TreatmentID=WholeBrain-C2Retrt + TreatmentUID=1.3.46.423632.33783920233217242713.224 + ReferenceUID=1.2.840.113854.59112832676204369253232190232540417741 + FirstName=Anonymized + LastName=Anonymized + + [RECONSTRUCTION] + ScanUID=1.3.46.423632.33783920233217242713.224.2023-03-21165402768 + TubeKV=100.0000 + TubeMA=10.0000 + CollimatorName=S20 +""") + +SAMPLE_FRAMES_XML = textwrap.dedent("""\ + + + + WholeBrain-C2Retrt + + + + + +""") + +FULL_FRAMES_XML = textwrap.dedent("""\ + + + + Prostate + + + 4ee Pelvis Soft S20 179-181 + 1.3.46.423632.33783920233217242713.500 + 120.0 + 25.5 + + + + + +""") + + +# --------------------------------------------------------------------------- +# parse_xvi_ini +# --------------------------------------------------------------------------- + +class TestParseXviIni: + def test_all_fields_extracted(self): + result = parse_xvi_ini(SAMPLE_INI) + assert result["PatientID"] == "15002197" + assert result["TreatmentID"] == "WholeBrain-C2Retrt" + assert result["TreatmentUID"] == "1.3.46.423632.33783920233217242713.224" + assert result["ReferenceUID"] == "1.2.840.113854.59112832676204369253232190232540417741" + assert result["FirstName"] == "Anonymized" + assert result["LastName"] == "Anonymized" + assert result["ScanUID"] == 
"1.3.46.423632.33783920233217242713.224.2023-03-21165402768" + assert result["TubeKV"] == "100.0000" + assert result["TubeMA"] == "10.0000" + assert result["CollimatorName"] == "S20" + + def test_missing_fields_omitted(self): + partial_ini = "[IDENTIFICATION]\nPatientID=12345\n" + result = parse_xvi_ini(partial_ini) + assert result["PatientID"] == "12345" + assert "TubeKV" not in result + assert "ScanUID" not in result + + def test_empty_input(self): + assert parse_xvi_ini("") == {} + + def test_whitespace_stripped(self): + ini = "TubeKV=100.0000 \nTubeMA= 10.0000 \n" + result = parse_xvi_ini(ini) + assert result["TubeKV"] == "100.0000" + assert result["TubeMA"] == "10.0000" + + +# --------------------------------------------------------------------------- +# parse_scan_datetime +# --------------------------------------------------------------------------- + +class TestParseScanDatetime: + def test_valid_scan_uid(self): + uid = "1.3.46.423632.33783920233217242713.224.2023-03-21165402768" + dt = parse_scan_datetime(uid) + assert dt == datetime(2023, 3, 21, 16, 54, 2, 768000) + + def test_different_date(self): + uid = "1.3.46.423632.12345.2024-12-01093015500" + dt = parse_scan_datetime(uid) + assert dt == datetime(2024, 12, 1, 9, 30, 15, 500000) + + def test_no_datetime_returns_none(self): + assert parse_scan_datetime("1.3.46.423632.12345") is None + + def test_invalid_date_returns_none(self): + # Month 13 is invalid + assert parse_scan_datetime("1.3.46.12345.2023-13-01120000000") is None + + def test_empty_string(self): + assert parse_scan_datetime("") is None + + +# --------------------------------------------------------------------------- +# parse_frames_xml +# --------------------------------------------------------------------------- + +class TestParseFramesXml: + def test_valid_xml(self, tmp_path): + xml_file = tmp_path / "_Frames.xml" + xml_file.write_text(SAMPLE_FRAMES_XML, encoding="utf-8") + result = parse_frames_xml(xml_file) + assert 
result["treatment_id"] == "WholeBrain-C2Retrt" + + def test_missing_treatment_element(self, tmp_path): + xml_file = tmp_path / "_Frames.xml" + xml_file.write_text('', encoding="utf-8") + result = parse_frames_xml(xml_file) + assert result["treatment_id"] is None + + def test_missing_id_element(self, tmp_path): + xml_file = tmp_path / "_Frames.xml" + xml_content = '' + xml_file.write_text(xml_content, encoding="utf-8") + result = parse_frames_xml(xml_file) + assert result["treatment_id"] is None + + def test_nonexistent_file(self, tmp_path): + result = parse_frames_xml(tmp_path / "does_not_exist.xml") + assert result["treatment_id"] is None + + def test_malformed_xml(self, tmp_path): + xml_file = tmp_path / "_Frames.xml" + xml_file.write_text(" Path: + """Create a minimal DICOM file at *filepath* with the given identity tags.""" + filepath.parent.mkdir(parents=True, exist_ok=True) + + file_meta = pydicom.Dataset() + file_meta.MediaStorageSOPClassUID = "1.2.840.10008.5.1.4.1.1.2" + file_meta.MediaStorageSOPInstanceUID = generate_uid() + file_meta.TransferSyntaxUID = ExplicitVRLittleEndian + + ds = FileDataset(str(filepath), {}, file_meta=file_meta, preamble=b"\x00" * 128) + ds.is_little_endian = True + ds.is_implicit_VR = False + ds.PatientName = patient_name + ds.PatientID = patient_id + ds.StudyID = "STUDY1" + ds.StudyInstanceUID = generate_uid() + ds.SeriesInstanceUID = generate_uid() + ds.SOPInstanceUID = file_meta.MediaStorageSOPInstanceUID + ds.SOPClassUID = file_meta.MediaStorageSOPClassUID + ds.save_as(filepath) + return filepath + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + +class TestCleanDirectory: + def test_no_findings_when_clean(self, tmp_path): + """A properly anonymised directory returns an empty findings list.""" + _make_test_dicom(tmp_path / "slice.dcm") + (tmp_path / "notes.txt").write_text("All good", encoding="utf-8") + 
(tmp_path / "frames.xml").write_text("ok", encoding="utf-8") + + findings = verify_no_pii(tmp_path, ["12345678", "SMITH", "JOHN"]) + assert findings == [] + + def test_nonexistent_directory(self, tmp_path): + findings = verify_no_pii(tmp_path / "nope", ["SMITH"]) + assert findings == [] + + +class TestDicomPii: + def test_pii_in_patient_name(self, tmp_path): + """PII in DICOM PatientName tag is detected.""" + _make_test_dicom( + tmp_path / "bad.dcm", + patient_name="SMITH^JOHN", + patient_id="PAT01", + ) + findings = verify_no_pii(tmp_path, ["SMITH"]) + assert len(findings) >= 1 + dicom_findings = [f for f in findings if "tag" in f["location"]] + assert any("SMITH" == f["matched"] for f in dicom_findings) + + def test_pii_in_patient_id(self, tmp_path): + """PII in DICOM PatientID tag is detected.""" + _make_test_dicom( + tmp_path / "bad.dcm", + patient_name="PAT01^Prostate", + patient_id="12345678", + ) + findings = verify_no_pii(tmp_path, ["12345678"]) + dicom_findings = [f for f in findings if "tag" in f["location"]] + assert any("12345678" == f["matched"] for f in dicom_findings) + + +class TestFilenamePii: + def test_pii_in_filename(self, tmp_path): + """PII substring in a filename is detected.""" + (tmp_path / "DCMRT_Plan(SMITH JOHN).dcm").write_bytes(b"") + findings = verify_no_pii(tmp_path, ["SMITH"]) + filename_findings = [f for f in findings if f["location"] == "filename"] + assert len(filename_findings) == 1 + assert filename_findings[0]["matched"] == "SMITH" + + +class TestXmlPii: + def test_pii_in_xml(self, tmp_path): + """PII in XML file content is detected.""" + xml = "12345678SMITH" + (tmp_path / "frames.xml").write_text(xml, encoding="utf-8") + + findings = verify_no_pii(tmp_path, ["12345678", "SMITH"]) + xml_findings = [f for f in findings if f["location"] == "xml text"] + assert len(xml_findings) == 2 + + +class TestTxtPii: + def test_pii_in_txt(self, tmp_path): + """PII in a plain text file is detected.""" + (tmp_path / 
"centroid.txt").write_text( + "Centroid for patient 12345678\n1.0 2.0 3.0", + encoding="utf-8", + ) + findings = verify_no_pii(tmp_path, ["12345678"]) + txt_findings = [f for f in findings if f["location"] == "text content"] + assert len(txt_findings) == 1 + assert txt_findings[0]["matched"] == "12345678" + + +class TestCaseInsensitivity: + def test_lowercase_match(self, tmp_path): + """Search is case-insensitive (lowercase in file, uppercase query).""" + (tmp_path / "notes.txt").write_text("patient smith", encoding="utf-8") + findings = verify_no_pii(tmp_path, ["SMITH"]) + assert len(findings) >= 1 + + def test_mixed_case_filename(self, tmp_path): + """Filename matching is case-insensitive.""" + (tmp_path / "Smith_data.txt").write_text("clean", encoding="utf-8") + findings = verify_no_pii(tmp_path, ["SMITH"]) + filename_findings = [f for f in findings if f["location"] == "filename"] + assert len(filename_findings) == 1