Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 30 additions & 7 deletions lib/tasks/detect_duplicates.rake
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# frozen_string_literal: true

require "csv"

namespace :duplicates do
desc "Detect and list duplicates according to Rule 3"
task detect: :environment do
Expand All @@ -12,10 +11,15 @@ namespace :duplicates do
processed = 0
batch_size = 1000
progress_interval = batch_size * 5
# Logging thresholds, each overridable through an environment variable.
# String#to_i yields 0 for blank or non-numeric input, and a zero interval would raise
# ZeroDivisionError in the `global_index % log_every` check, so fall back to the default.
log_every = ENV.fetch("DUPLICATES_LOG_EVERY", "500").to_i
log_every = 500 unless log_every.positive?
# A checksum whose processing takes at least this many seconds is always logged.
slow_seconds = ENV.fetch("DUPLICATES_SLOW_SECONDS", "10").to_f
# A checksum shared by at least this many assets is always logged.
large_group_size = ENV.fetch("DUPLICATES_LARGE_GROUP_SIZE", "20000").to_i

puts "Scanning assets with matching checksums..."
puts "Processing in batches of #{batch_size}..."

main_volume_names = %w[Deposit Media-Repository]

duplicate_checksums = IsilonAsset.where("NULLIF(TRIM(file_checksum), '') IS NOT NULL")
.group(:file_checksum)
.having("COUNT(*) > 1")
Expand All @@ -34,9 +38,12 @@ namespace :duplicates do
end

written = 0
CSV.open(output_path, "w", write_headers: true, headers: [ "FullPath" ]) do |csv|
headers = [ "File", "Path", "Checksum", "File Size" ]
CSV.open(output_path, "w", write_headers: true, headers: headers) do |csv|
duplicate_checksums.each_slice(batch_size) do |checksum_batch|
checksum_batch.each do |checksum|
batch_started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
checksum_batch.each_with_index do |checksum, index|
started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
asset_ids = IsilonAsset.where(file_checksum: checksum).pluck(:id)
next if asset_ids.empty?

Expand All @@ -55,15 +62,28 @@ namespace :duplicates do
DuplicateGroupMembership.insert_all(rows) if rows.any?
IsilonAsset.where(id: asset_ids).update_all(has_duplicates: true)

IsilonAsset.where(id: asset_ids)
.includes(parent_folder: :volume)
.find_each do |asset|
main_scope = IsilonAsset.joins(parent_folder: :volume)
.where(file_checksum: checksum, volumes: { name: main_volume_names })
outside_scope = IsilonAsset.joins(parent_folder: :volume)
.where(file_checksum: checksum)
.where.not(volumes: { name: main_volume_names })

next unless main_scope.exists?
next unless outside_scope.exists?

outside_scope.includes(parent_folder: :volume).find_each do |asset|
full_path = build_full_path.call(asset)
next unless full_path

csv << [ full_path ]
csv << [ asset.isilon_name, full_path, checksum, asset.file_size ]
written += 1
end

elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
global_index = processed + index + 1
if (global_index % log_every == 0) || elapsed >= slow_seconds || asset_ids.length >= large_group_size
puts "Processed checksum #{global_index}/#{duplicate_checksums.length} (assets=#{asset_ids.length}) in #{format('%.2f', elapsed)}s"
end
end

processed += checksum_batch.size
Expand All @@ -72,6 +92,9 @@ namespace :duplicates do
if processed % progress_interval == 0
puts "Processed #{processed} checksum groups..."
end

batch_elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - batch_started_at
puts "Batch complete (#{checksum_batch.size} checksums) in #{format('%.2f', batch_elapsed)}s"
end
end

Expand Down
28 changes: 28 additions & 0 deletions spec/tasks/duplicates_rake_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
let!(:media_volume) { create(:volume, name: "Media-Repository") }
let!(:deposit_folder) { create(:isilon_folder, volume: deposit_volume, full_path: "/Deposit/project") }
let!(:media_folder) { create(:isilon_folder, volume: media_volume, full_path: "/Media-Repository/project") }
# CSV file that the export example reads back after invoking duplicates:detect.
let(:export_path) { Rails.root.join("log", "isilon-duplicate-paths.csv") }

# Remove the export after each example so a file left by one run cannot be read by the next.
after { File.delete(export_path) if File.exist?(export_path) }

it "groups assets with matching checksums" do
original = create(:isilon_asset, parent_folder: deposit_folder, file_checksum: "abc", file_size: "100")
Expand All @@ -37,6 +41,30 @@
expect(IsilonAsset.where(has_duplicates: true).count).to eq(4)
expect(IsilonAsset.where(has_duplicates: false).count).to eq(2)
end

# Checksum "abc" is placed on deposit_folder, media_folder and a folder on a third volume ("Other");
# checksum "xyz" exists only once. The export is expected to contain just the "Other" copy of "abc",
# together with its path, checksum and file size.
it "exports child rows with checksum and file size for checksums shared across main and outside volumes" do
  outside_folder = create(:isilon_folder, volume: create(:volume, name: "Other"), full_path: "/Other/project")

  [
    [ deposit_folder, "main.txt", "abc" ],
    [ media_folder, "main2.txt", "abc" ],
    [ outside_folder, "out.txt", "abc" ],
    [ outside_folder, "solo.txt", "xyz" ]
  ].each do |folder, file_name, checksum|
    create(:isilon_asset, parent_folder: folder, isilon_path: "/project/#{file_name}", isilon_name: file_name,
                          file_checksum: checksum, file_size: "100")
  end

  Rake::Task["duplicates:detect"].invoke

  # Index the exported rows by their "File" column; a name with no rows has no key.
  rows_by_file = CSV.read(export_path, headers: true).group_by { |row| row["File"] }
  out_row = rows_by_file.fetch("out.txt", []).first

  expect(out_row).to be_present
  expect(out_row["Path"]).to eq("/Other/project/out.txt")
  expect(out_row["Checksum"]).to eq("abc")
  expect(out_row["File Size"]).to eq("100")

  %w[main.txt main2.txt solo.txt].each do |file_name|
    expect(rows_by_file[file_name]).to be_nil
  end
end
end

describe "duplicates:clear" do
Expand Down