From 4de46a05b3730256f093ea10defbe08663c86c60 Mon Sep 17 00:00:00 2001
From: Chris Doyle
Date: Wed, 4 Feb 2026 11:38:52 -0500
Subject: [PATCH 1/4] add logging to external file
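
Tee the task's progress output to log/isilon-duplicates-detect.log in
addition to stdout, and record per-checksum and per-batch timings.
Logging cadence and the slow-group threshold are tunable via
environment variables. A typical invocation might look like the
following (values are illustrative, not the defaults; the bin/rails
entry point is assumed):

    DUPLICATES_LOG_EVERY=250 DUPLICATES_SLOW_SECONDS=5 bin/rails duplicates:detect

The defaults are a progress line every 500 checksum groups and a 10s
slow-group threshold.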
- puts "Processed: #{processed} checksum groups" - puts "Duplicate paths exported to #{output_path} (#{written} rows)" + log.call("\n✓ Complete!") + log.call("Processed: #{processed} checksum groups") + log.call("Duplicate paths exported to #{output_path} (#{written} rows)") end desc "Show duplicate statistics" From 6406f9a20e341f409604fd49e24e83931409003d Mon Sep 17 00:00:00 2001 From: Chris Doyle Date: Wed, 4 Feb 2026 11:47:36 -0500 Subject: [PATCH 2/4] fix test --- lib/tasks/detect_duplicates.rake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/tasks/detect_duplicates.rake b/lib/tasks/detect_duplicates.rake index 8b78c0f..e883b94 100644 --- a/lib/tasks/detect_duplicates.rake +++ b/lib/tasks/detect_duplicates.rake @@ -21,6 +21,7 @@ namespace :duplicates do progress_interval = batch_size * 5 log_every = ENV.fetch("DUPLICATES_LOG_EVERY", "500").to_i slow_seconds = ENV.fetch("DUPLICATES_SLOW_SECONDS", "10").to_f + large_group_size = ENV.fetch("DUPLICATES_LARGE_GROUP_SIZE", "20000").to_i log.call("Scanning assets with matching checksums...") log.call("Processing in batches of #{batch_size}...") @@ -78,7 +79,7 @@ namespace :duplicates do elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at global_index = processed + index + 1 - if (global_index % log_every == 0) || elapsed >= slow_seconds || asset_ids.length >= (BATCH_UPDATE_SIZE * 2) + if (global_index % log_every == 0) || elapsed >= slow_seconds || asset_ids.length >= large_group_size log.call("Processed checksum #{global_index}/#{duplicate_checksums.length} (assets=#{asset_ids.length}) in #{format('%.2f', elapsed)}s") end end From dbc173b1c3f75aaf9eda8ffa7de73e1f2ab6ab3f Mon Sep 17 00:00:00 2001 From: Chris Doyle Date: Thu, 5 Feb 2026 14:44:28 -0500 Subject: [PATCH 3/4] update csv export to child only, no parents --- lib/tasks/detect_duplicates.rake | 20 ++++++++++++++----- spec/tasks/duplicates_rake_spec.rb | 31 ++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 5 deletions(-) diff --git a/lib/tasks/detect_duplicates.rake b/lib/tasks/detect_duplicates.rake index e883b94..e8f0174 100644 --- a/lib/tasks/detect_duplicates.rake +++ b/lib/tasks/detect_duplicates.rake @@ -26,6 +26,8 @@ namespace :duplicates do log.call("Scanning assets with matching checksums...") log.call("Processing in batches of #{batch_size}...") + main_volume_names = %w[Deposit Media-Repository] + duplicate_checksums = IsilonAsset.where("NULLIF(TRIM(file_checksum), '') IS NOT NULL") .group(:file_checksum) .having("COUNT(*) > 1") @@ -44,7 +46,8 @@ namespace :duplicates do end written = 0 - CSV.open(output_path, "w", write_headers: true, headers: [ "FullPath" ]) do |csv| + headers = [ "File", "Path", "Checksum", "File Size" ] + CSV.open(output_path, "w", write_headers: true, headers: headers) do |csv| duplicate_checksums.each_slice(batch_size) do |checksum_batch| batch_started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC) checksum_batch.each_with_index do |checksum, index| @@ -67,13 +70,20 @@ namespace :duplicates do DuplicateGroupMembership.insert_all(rows) if rows.any? 
---
 lib/tasks/detect_duplicates.rake | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/tasks/detect_duplicates.rake b/lib/tasks/detect_duplicates.rake
index 8b78c0f..e883b94 100644
--- a/lib/tasks/detect_duplicates.rake
+++ b/lib/tasks/detect_duplicates.rake
@@ -21,6 +21,7 @@ namespace :duplicates do
     progress_interval = batch_size * 5
     log_every = ENV.fetch("DUPLICATES_LOG_EVERY", "500").to_i
     slow_seconds = ENV.fetch("DUPLICATES_SLOW_SECONDS", "10").to_f
+    large_group_size = ENV.fetch("DUPLICATES_LARGE_GROUP_SIZE", "20000").to_i
 
     log.call("Scanning assets with matching checksums...")
     log.call("Processing in batches of #{batch_size}...")
@@ -78,7 +79,7 @@ namespace :duplicates do
 
           elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
           global_index = processed + index + 1
-          if (global_index % log_every == 0) || elapsed >= slow_seconds || asset_ids.length >= (BATCH_UPDATE_SIZE * 2)
+          if (global_index % log_every == 0) || elapsed >= slow_seconds || asset_ids.length >= large_group_size
             log.call("Processed checksum #{global_index}/#{duplicate_checksums.length} (assets=#{asset_ids.length}) in #{format('%.2f', elapsed)}s")
           end
         end

From dbc173b1c3f75aaf9eda8ffa7de73e1f2ab6ab3f Mon Sep 17 00:00:00 2001
From: Chris Doyle
Date: Thu, 5 Feb 2026 14:44:28 -0500
Subject: [PATCH 3/4] update csv export to child only, no parents
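
Narrow the export to "child" copies only: a row is written only when
the asset lives outside the main volumes (Deposit, Media-Repository)
and its checksum also exists inside a main volume; checksums confined
to one side are skipped entirely. Rows now carry File, Path, Checksum,
and File Size instead of the single FullPath column, so the output
looks like (sample row taken from the new spec):

    File,Path,Checksum,File Size
    out.txt,/Other/project/out.txt,abc,100

Flagging every member of a duplicate group via has_duplicates is
unchanged.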
---
 lib/tasks/detect_duplicates.rake   | 20 +++++++++++++++-----
 spec/tasks/duplicates_rake_spec.rb | 31 +++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/lib/tasks/detect_duplicates.rake b/lib/tasks/detect_duplicates.rake
index e883b94..e8f0174 100644
--- a/lib/tasks/detect_duplicates.rake
+++ b/lib/tasks/detect_duplicates.rake
@@ -26,6 +26,8 @@ namespace :duplicates do
     log.call("Scanning assets with matching checksums...")
     log.call("Processing in batches of #{batch_size}...")
 
+    main_volume_names = %w[Deposit Media-Repository]
+
     duplicate_checksums = IsilonAsset.where("NULLIF(TRIM(file_checksum), '') IS NOT NULL")
                                      .group(:file_checksum)
                                      .having("COUNT(*) > 1")
@@ -44,7 +46,8 @@ namespace :duplicates do
     end
 
     written = 0
-    CSV.open(output_path, "w", write_headers: true, headers: [ "FullPath" ]) do |csv|
+    headers = [ "File", "Path", "Checksum", "File Size" ]
+    CSV.open(output_path, "w", write_headers: true, headers: headers) do |csv|
       duplicate_checksums.each_slice(batch_size) do |checksum_batch|
         batch_started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
         checksum_batch.each_with_index do |checksum, index|
@@ -67,13 +70,20 @@ namespace :duplicates do
           DuplicateGroupMembership.insert_all(rows) if rows.any?
 
           IsilonAsset.where(id: asset_ids).update_all(has_duplicates: true)
 
-          IsilonAsset.where(id: asset_ids)
-                     .includes(parent_folder: :volume)
-                     .find_each do |asset|
+          main_scope = IsilonAsset.joins(parent_folder: :volume)
+                                  .where(file_checksum: checksum, volumes: { name: main_volume_names })
+          outside_scope = IsilonAsset.joins(parent_folder: :volume)
+                                     .where(file_checksum: checksum)
+                                     .where.not(volumes: { name: main_volume_names })
+
+          next unless main_scope.exists?
+          next unless outside_scope.exists?
+
+          outside_scope.includes(parent_folder: :volume).find_each do |asset|
             full_path = build_full_path.call(asset)
             next unless full_path
 
-            csv << [ full_path ]
+            csv << [ asset.isilon_name, full_path, checksum, asset.file_size ]
             written += 1
           end
diff --git a/spec/tasks/duplicates_rake_spec.rb b/spec/tasks/duplicates_rake_spec.rb
index 75c8036..2281b8a 100644
--- a/spec/tasks/duplicates_rake_spec.rb
+++ b/spec/tasks/duplicates_rake_spec.rb
@@ -18,6 +18,13 @@
   let!(:media_volume) { create(:volume, name: "Media-Repository") }
   let!(:deposit_folder) { create(:isilon_folder, volume: deposit_volume, full_path: "/Deposit/project") }
   let!(:media_folder) { create(:isilon_folder, volume: media_volume, full_path: "/Media-Repository/project") }
+  let(:export_path) { Rails.root.join("log/isilon-duplicate-paths.csv") }
+  let(:detect_log_path) { Rails.root.join("log/isilon-duplicates-detect.log") }
+
+  after do
+    File.delete(export_path) if File.exist?(export_path)
+    File.delete(detect_log_path) if File.exist?(detect_log_path)
+  end
 
   it "groups assets with matching checksums" do
     original = create(:isilon_asset, parent_folder: deposit_folder, file_checksum: "abc", file_size: "100")
@@ -37,6 +44,30 @@
     expect(IsilonAsset.where(has_duplicates: true).count).to eq(4)
     expect(IsilonAsset.where(has_duplicates: false).count).to eq(2)
   end
+
+  it "exports child rows with checksum and file size for checksums shared across main and outside volumes" do
+    other_volume = create(:volume, name: "Other")
+    other_folder = create(:isilon_folder, volume: other_volume, full_path: "/Other/project")
+
+    create(:isilon_asset, parent_folder: deposit_folder, isilon_path: "/project/main.txt", isilon_name: "main.txt", file_checksum: "abc", file_size: "100")
+    create(:isilon_asset, parent_folder: media_folder, isilon_path: "/project/main2.txt", isilon_name: "main2.txt", file_checksum: "abc", file_size: "100")
+    outside_asset = create(:isilon_asset, parent_folder: other_folder, isilon_path: "/project/out.txt", isilon_name: "out.txt", file_checksum: "abc", file_size: "100")
+    create(:isilon_asset, parent_folder: other_folder, isilon_path: "/project/solo.txt", isilon_name: "solo.txt", file_checksum: "xyz", file_size: "100")
+
+    Rake::Task["duplicates:detect"].invoke
+
+    exported = CSV.read(export_path, headers: true)
+    child_row = exported.find { |row| row["File"] == "out.txt" }
+    solo_row = exported.find { |row| row["File"] == "solo.txt" }
+
+    expect(child_row).to be_present
+    expect(child_row["Path"]).to eq("/Other/project/out.txt")
+    expect(child_row["Checksum"]).to eq("abc")
+    expect(child_row["File Size"]).to eq("100")
+    expect(exported.find { |row| row["File"] == "main.txt" }).to be_nil
+    expect(exported.find { |row| row["File"] == "main2.txt" }).to be_nil
+    expect(solo_row).to be_nil
+  end
 end

 describe "duplicates:clear" do

From 5162a475b650af8ed44e1fb95541f33a3d401b78 Mon Sep 17 00:00:00 2001
From: Chris Doyle
Date: Thu, 5 Feb 2026 14:49:09 -0500
Subject: [PATCH 4/4] remove external logging as csv file has smaller scope

---
 lib/tasks/detect_duplicates.rake   | 26 +++++++++-----------------
 spec/tasks/duplicates_rake_spec.rb |  3 ---
 2 files changed, 9 insertions(+), 20 deletions(-)

diff --git a/lib/tasks/detect_duplicates.rake b/lib/tasks/detect_duplicates.rake
index e8f0174..4107323 100644
--- a/lib/tasks/detect_duplicates.rake
+++ b/lib/tasks/detect_duplicates.rake
@@ -1,18 +1,10 @@
 # frozen_string_literal: true
 require "csv"
-require "logger"
-
 namespace :duplicates do
   desc "Detect and list duplicates according to Rule 3"
   task detect: :environment do
-    logger = Logger.new("log/isilon-duplicates-detect.log")
-    log = lambda do |message|
-      puts message
-      logger.info(message)
-    end
-
-    log.call("Starting Rule 3 duplicate detection...")
+    puts "Starting Rule 3 duplicate detection..."
 
     # Find all assets outside main areas with non-empty checksums
     output_path = "log/isilon-duplicate-paths.csv"
@@ -23,8 +15,8 @@ namespace :duplicates do
     slow_seconds = ENV.fetch("DUPLICATES_SLOW_SECONDS", "10").to_f
     large_group_size = ENV.fetch("DUPLICATES_LARGE_GROUP_SIZE", "20000").to_i
 
-    log.call("Scanning assets with matching checksums...")
-    log.call("Processing in batches of #{batch_size}...")
+    puts "Scanning assets with matching checksums..."
+    puts "Processing in batches of #{batch_size}..."
 
     main_volume_names = %w[Deposit Media-Repository]
 
@@ -90,7 +82,7 @@ namespace :duplicates do
 
           elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
           global_index = processed + index + 1
           if (global_index % log_every == 0) || elapsed >= slow_seconds || asset_ids.length >= large_group_size
-            log.call("Processed checksum #{global_index}/#{duplicate_checksums.length} (assets=#{asset_ids.length}) in #{format('%.2f', elapsed)}s")
+            puts "Processed checksum #{global_index}/#{duplicate_checksums.length} (assets=#{asset_ids.length}) in #{format('%.2f', elapsed)}s"
           end
         end
@@ -98,17 +90,17 @@ namespace :duplicates do
         processed += checksum_batch.size
         GC.start
 
         if processed % progress_interval == 0
-          log.call("Processed #{processed} checksum groups...")
+          puts "Processed #{processed} checksum groups..."
         end
 
         batch_elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - batch_started_at
-        log.call("Batch complete (#{checksum_batch.size} checksums) in #{format('%.2f', batch_elapsed)}s")
+        puts "Batch complete (#{checksum_batch.size} checksums) in #{format('%.2f', batch_elapsed)}s"
       end
     end
 
-    log.call("\n✓ Complete!")
-    log.call("Processed: #{processed} checksum groups")
-    log.call("Duplicate paths exported to #{output_path} (#{written} rows)")
+    puts "\n✓ Complete!"
+    puts "Processed: #{processed} checksum groups"
+    puts "Duplicate paths exported to #{output_path} (#{written} rows)"
   end
 
   desc "Show duplicate statistics"
diff --git a/spec/tasks/duplicates_rake_spec.rb b/spec/tasks/duplicates_rake_spec.rb
index 2281b8a..4cfce85 100644
--- a/spec/tasks/duplicates_rake_spec.rb
+++ b/spec/tasks/duplicates_rake_spec.rb
@@ -19,11 +19,8 @@
   let!(:deposit_folder) { create(:isilon_folder, volume: deposit_volume, full_path: "/Deposit/project") }
   let!(:media_folder) { create(:isilon_folder, volume: media_volume, full_path: "/Media-Repository/project") }
   let(:export_path) { Rails.root.join("log/isilon-duplicate-paths.csv") }
-  let(:detect_log_path) { Rails.root.join("log/isilon-duplicates-detect.log") }
-
   after do
     File.delete(export_path) if File.exist?(export_path)
-    File.delete(detect_log_path) if File.exist?(detect_log_path)
   end
 
   it "groups assets with matching checksums" do