From b9debde66ca2536d556d69d678b9678980a0cce3 Mon Sep 17 00:00:00 2001 From: Andrey Fedorov Date: Fri, 23 Jan 2026 11:26:41 -0500 Subject: [PATCH 1/2] enh: add seg_index describing Segmentation series available Relies on a slightly modified version of the IDC segmentations table - this should be fixed once the query in this PR goes live: https://github.com/ImagingDataCommons/etl_flow/pull/129 --- assets/README.md | 3 +- assets/seg_index.sql | 153 +++++++++++++++++++++++++++++++++++++++++++ hatch_build.py | 1 + 3 files changed, 155 insertions(+), 2 deletions(-) create mode 100644 assets/seg_index.sql diff --git a/assets/README.md b/assets/README.md index b72c27c..7b96dbf 100644 --- a/assets/README.md +++ b/assets/README.md @@ -1,3 +1,2 @@ This folder contains SQL scripts that are used to generate tables that are -attached to the releases as assets. Initially, those will be generated and -attached manually, but in the future this process may be automated. +attached to the releases as assets. \ No newline at end of file diff --git a/assets/seg_index.sql b/assets/seg_index.sql new file mode 100644 index 0000000..4ef9a67 --- /dev/null +++ b/assets/seg_index.sql @@ -0,0 +1,153 @@ +# table-description: +# This table contains one row per DICOM Segmentation +# SeriesInstanceUID available from IDC, and captures +# key metadata about the segmentation series including +# the number of segments, segmentation type, algorithm +# type and name, and the segmented image series. 
+WITH + segmentations AS ( + WITH + segmentations AS ( + WITH + segs AS ( + SELECT + PatientID, + StudyInstanceUID, + SeriesInstanceUID, + SOPInstanceUID, + FrameOfReferenceUID, + SegmentSequence, + SegmentationType + FROM + `bigquery-public-data.idc_current.dicom_metadata` + WHERE + # select DICOM Segmentation Storage instances by SOP Class UID; more reliable than Modality = "SEG" + SOPClassUID = "1.2.840.10008.5.1.4.1.1.66.4" + ) + SELECT + PatientID, + StudyInstanceUID, + SeriesInstanceUID, + SOPInstanceUID, + FrameOfReferenceUID, + SegmentationType, + CASE ARRAY_LENGTH(unnested.AnatomicRegionSequence) + WHEN 0 THEN NULL + ELSE + STRUCT( + unnested.AnatomicRegionSequence[OFFSET(0)].CodeValue + AS CodeValue, + unnested.AnatomicRegionSequence[OFFSET(0)].CodingSchemeDesignator + AS CodingSchemeDesignator, + unnested.AnatomicRegionSequence[OFFSET(0)].CodeMeaning + AS CodeMeaning) + END + AS AnatomicRegion, + CASE + ( + ARRAY_LENGTH(unnested.AnatomicRegionSequence) > 0 + AND ARRAY_LENGTH( + unnested.AnatomicRegionSequence[OFFSET(0)].AnatomicRegionModifierSequence) + > 0) + WHEN TRUE + THEN + unnested.AnatomicRegionSequence[OFFSET(0)].AnatomicRegionModifierSequence[ + OFFSET( + 0)] # unnested.AnatomicRegionSequence[OFFSET(0)].AnatomicRegionModifierSequence, + ELSE + NULL + END + AS AnatomicRegionModifier, + CASE ARRAY_LENGTH(unnested.SegmentedPropertyCategoryCodeSequence) + WHEN 0 THEN NULL + ELSE + unnested.SegmentedPropertyCategoryCodeSequence[ + OFFSET(0)] + END + AS SegmentedPropertyCategory, + CASE ARRAY_LENGTH(unnested.SegmentedPropertyTypeCodeSequence) + WHEN 0 THEN NULL + ELSE + unnested.SegmentedPropertyTypeCodeSequence[ + OFFSET(0)] + END + AS SegmentedPropertyType, + # unnested.SegmentedPropertyTypeCodeSequence, + # unnested.SegmentedPropertyTypeModifierCodeSequence, + unnested.SegmentAlgorithmType, + # for unknown reason, this attribute is REPEATED in our BigQuery schema; SAFE_OFFSET(0) takes the first value and yields NULL when the array is empty + 
unnested.SegmentAlgorithmName[SAFE_OFFSET(0)] as SegmentAlgorithmName, + unnested.SegmentNumber, + unnested.TrackingUID, + unnested.TrackingID + 
+ FROM + segs + CROSS JOIN + UNNEST(SegmentSequence) AS unnested + ), + sampled_sops AS ( + SELECT + SOPInstanceUID AS seg_SOPInstanceUID, + ReferencedSeriesSequence[SAFE_OFFSET(0)].ReferencedInstanceSequence[SAFE_OFFSET(0)].ReferencedSOPInstanceUID + AS rss_one, + ReferencedImageSequence[SAFE_OFFSET(0)].ReferencedSOPInstanceUID + AS ris_one, + SourceImageSequence[SAFE_OFFSET(0)].ReferencedSOPInstanceUID + AS sis_one + FROM + `bigquery-public-data.idc_current.dicom_all` + WHERE + Modality = "SEG" + AND SOPClassUID = "1.2.840.10008.5.1.4.1.1.66.4" + ), + coalesced_ref AS ( + SELECT + *, + COALESCE(rss_one, ris_one, sis_one) AS referenced_sop + FROM + sampled_sops + ) + SELECT + segmentations.*, + dicom_all.SeriesInstanceUID AS segmented_SeriesInstanceUID, + CONCAT( + "https://viewer.imaging.datacommons.cancer.gov/viewer/", + segmentations.StudyInstanceUID, + "?seriesInstanceUID=", + segmentations.SeriesInstanceUID, + ",", + dicom_all.SeriesInstanceUID) AS viewer_url, + FROM + coalesced_ref + JOIN + `bigquery-public-data.idc_current.dicom_all` AS dicom_all + ON + coalesced_ref.referenced_sop = dicom_all.SOPInstanceUID + RIGHT JOIN + segmentations + ON + segmentations.SOPInstanceUID = coalesced_ref.seg_SOPInstanceUID + ) +SELECT + # description: + # DICOM SeriesInstanceUID identifier of the segmentation series + SeriesInstanceUID, + # description: + # Type of segmentation as defined in DICOM SegmentationType attribute + any_value(SegmentationType) AS SegmentationType, + # description: + # Number of segments in the segmentation series obtained by counting distinct DICOM SegmentNumber values in the DICOM SegmentSequence + COUNT(DISTINCT (SegmentNumber)) total_segments, + # description: + # Segmentation algorithm type as available in DICOM SegmentAlgorithmType + string_agg(DISTINCT (SegmentAlgorithmType), ',') AS AlgorithmType, + # description: + # Segmentation algorithm name as available in DICOM SegmentAlgorithmName + string_agg(DISTINCT (SegmentAlgorithmName), 
+ ',') AS AlgorithmName, + # description: + # SeriesInstanceUID of the referenced image series that the segmentation applies to + any_value(segmentations.segmented_SeriesInstanceUID) + AS segmented_SeriesInstanceUID +FROM segmentations +GROUP BY SeriesInstanceUID +ORDER BY SegmentationType DESC diff --git a/hatch_build.py b/hatch_build.py index f97db8b..555f143 100644 --- a/hatch_build.py +++ b/hatch_build.py @@ -21,6 +21,7 @@ class IDCBuildHook(BuildHookInterface): "sm_index.parquet", "sm_instance_index.parquet", "clinical_index.parquet", + "seg_index.parquet", } def _prune_excluded_parquet_files(self) -> None: From eb1c65c3b58f938d95d5900a689a25bcc54653db Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 23 Jan 2026 16:33:45 +0000 Subject: [PATCH 2/2] style: pre-commit fixes --- assets/README.md | 2 +- assets/seg_index.sql | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/assets/README.md b/assets/README.md index 7b96dbf..1943f7e 100644 --- a/assets/README.md +++ b/assets/README.md @@ -1,2 +1,2 @@ This folder contains SQL scripts that are used to generate tables that are -attached to the releases as assets. \ No newline at end of file +attached to the releases as assets. diff --git a/assets/seg_index.sql b/assets/seg_index.sql index 4ef9a67..5ee825a 100644 --- a/assets/seg_index.sql +++ b/assets/seg_index.sql @@ -1,8 +1,8 @@ # table-description: -# This table contains one row per DICOM Segmentation -# SeriesInstanceUID available from IDC, and captures -# key metadata about the segmentation series including -# the number of segments, segmentation type, algorithm +# This table contains one row per DICOM Segmentation +# SeriesInstanceUID available from IDC, and captures +# key metadata about the segmentation series including +# the number of segments, segmentation type, algorithm # type and name, and the segmented image series. WITH segmentations AS (