From ffcaccdffee949e6ceafd80f5570db9fc6ef5dbf Mon Sep 17 00:00:00 2001 From: Brian Riley Date: Wed, 24 Jun 2026 11:09:05 -0700 Subject: [PATCH 1/3] update robots.txt and remove old cap step that replaces robots.txt --- CHANGELOG.md | 4 ++++ config/deploy.rb | 9 --------- config/robots.txt | 9 --------- public/robots.txt | 8 ++++---- 4 files changed, 8 insertions(+), 22 deletions(-) delete mode 100644 config/robots.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index 59f2931125..a07a93ce0d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ ## DMPTool Releases +### v5.5.9 +- Updated Robots.txt +- Removed old Capistrano deploy step that was replacing robots.txt + ### v5.58 - Added custom override for `idsc.miami.edu` and `miami.edu` diff --git a/config/deploy.rb b/config/deploy.rb index 272f0012b4..021fdfee15 100644 --- a/config/deploy.rb +++ b/config/deploy.rb @@ -42,8 +42,6 @@ namespace :deploy do before :compile_assets, 'deploy:retrieve_credentials' - after :deploy, 'dmptool_assets:copy_robots' - after :deploy, 'git:version' after :deploy, 'cleanup:remove_example_configs' after :deploy, 'deploy:chrome_install' @@ -113,11 +111,4 @@ execute "cd #{release_path} && bin/rails assets:clobber && bin/rails assets:precompile" end end - - desc 'Copy over the robots.txt file' - task :copy_robots do - on roles(:app), wait: 1 do - execute "cp -r #{release_path}/config/robots.txt #{release_path}/public/robots.txt" - end - end end diff --git a/config/robots.txt b/config/robots.txt deleted file mode 100644 index 16a87f98c2..0000000000 --- a/config/robots.txt +++ /dev/null @@ -1,9 +0,0 @@ -# See http://www.robotstxt.org/wc/norobots.html for documentation on how to use the robots.txt file -# -# To ban all spiders from the entire site uncomment the next two lines: -# User-Agent: * -# Disallow: / - -# Prevent spiders from downloading PDF files. 
-User-agent: * -Disallow: /pdf/ diff --git a/public/robots.txt b/public/robots.txt index 60cf8dfe18..b32b874606 100644 --- a/public/robots.txt +++ b/public/robots.txt @@ -5,12 +5,12 @@ # Disallow: / # Prevent spiders from downloading PDF files. - User-agent: * Crawl-delay: 5 -Disallow: /pdf/ Disallow: /rails/active_storage/ +Disallow: /narratives/*.pdf +# Explicitly name certain bots to ensure they are blocked from downloading PDF files. User-agent: Amazonbot User-agent: Amzn-SearchBot User-agent: AmazonProductDiscoverybot @@ -21,9 +21,9 @@ User-agent: ClaudeBot User-agent: CCBot User-agent: Googlebot User-agent: Meta-ExternalAgent +User-Agent: PetalBot User-agent: Turnitin User-agent: TurnitinBot Crawl-delay: 10 -Disallow: /pdf/ Disallow: /rails/active_storage/ - +Disallow: /narratives/*.pdf From ff6a6adc65ddc1d2d0df176c211a9361b458888c Mon Sep 17 00:00:00 2001 From: Brian Riley Date: Wed, 24 Jun 2026 11:22:57 -0700 Subject: [PATCH 2/3] readd cap step to copy robots. update robots file in config dir --- CHANGELOG.md | 3 +-- config/deploy.rb | 9 +++++++++ config/robots.txt | 27 +++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 config/robots.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index a07a93ce0d..dcfe326e3d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,8 +3,7 @@ ## DMPTool Releases ### v5.5.9 -- Updated Robots.txt -- Removed old Capistrano deploy step that was replacing robots.txt +- Updated config/robots.txt ### v5.58 - Added custom override for `idsc.miami.edu` and `miami.edu` diff --git a/config/deploy.rb b/config/deploy.rb index 021fdfee15..272f0012b4 100644 --- a/config/deploy.rb +++ b/config/deploy.rb @@ -42,6 +42,8 @@ namespace :deploy do before :compile_assets, 'deploy:retrieve_credentials' + after :deploy, 'dmptool_assets:copy_robots' + after :deploy, 'git:version' after :deploy, 'cleanup:remove_example_configs' after :deploy, 'deploy:chrome_install' @@ -111,4 +113,11 @@ execute "cd #{release_path} 
&& bin/rails assets:clobber && bin/rails assets:precompile" end end + + desc 'Copy over the robots.txt file' + task :copy_robots do + on roles(:app), wait: 1 do + execute "cp -r #{release_path}/config/robots.txt #{release_path}/public/robots.txt" + end + end end diff --git a/config/robots.txt b/config/robots.txt new file mode 100644 index 0000000000..df6326adc3 --- /dev/null +++ b/config/robots.txt @@ -0,0 +1,27 @@ +# See https://www.robotstxt.org/robotstxt.html for documentation on how to use the robots.txt file +# +# This file is copied into place during the Capistrano deploy process. + +# Prevent spiders from downloading PDF files. +User-agent: * +Crawl-delay: 5 +Disallow: /rails/active_storage/ +Disallow: /narratives/*.pdf + +# Explicitly name certain bots to ensure they are blocked from downloading PDF files. +User-agent: Amazonbot +User-agent: Amzn-SearchBot +User-agent: AmazonProductDiscoverybot +User-agent: Bytespider +User-agent: GPTBot +User-agent: OAI-SearchBot +User-agent: ClaudeBot +User-agent: CCBot +User-agent: Googlebot +User-agent: Meta-ExternalAgent +User-Agent: PetalBot +User-agent: Turnitin +User-agent: TurnitinBot +Crawl-delay: 10 +Disallow: /rails/active_storage/ +Disallow: /narratives/*.pdf From a8535092aedc074969561e12bfb0cda25a919602 Mon Sep 17 00:00:00 2001 From: Brian Riley Date: Wed, 24 Jun 2026 11:33:20 -0700 Subject: [PATCH 3/3] removed old unneeded public/robots.txt. 
Cap copies the one in config/robots.txt over after compiling assets --- public/robots.txt | 29 ----------------------------- 1 file changed, 29 deletions(-) delete mode 100644 public/robots.txt diff --git a/public/robots.txt b/public/robots.txt deleted file mode 100644 index b32b874606..0000000000 --- a/public/robots.txt +++ /dev/null @@ -1,29 +0,0 @@ -# See https://www.robotstxt.org/robotstxt.html for documentation on how to use the robots.txt file -# -# To ban all spiders from the entire site uncomment the next two lines: -# User-Agent: * -# Disallow: / - -# Prevent spiders from downloading PDF files. -User-agent: * -Crawl-delay: 5 -Disallow: /rails/active_storage/ -Disallow: /narratives/*.pdf - -# Explicitly name certain bots to ensure they are blocked from downloading PDF files. -User-agent: Amazonbot -User-agent: Amzn-SearchBot -User-agent: AmazonProductDiscoverybot -User-agent: Bytespider -User-agent: GPTBot -User-agent: OAI-SearchBot -User-agent: ClaudeBot -User-agent: CCBot -User-agent: Googlebot -User-agent: Meta-ExternalAgent -User-Agent: PetalBot -User-agent: Turnitin -User-agent: TurnitinBot -Crawl-delay: 10 -Disallow: /rails/active_storage/ -Disallow: /narratives/*.pdf