Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 1 addition & 30 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,18 +1,14 @@
RUN = poetry run

biosample_sqlite_file = ~/biosample_basex_data_good_subset.db

.PHONY: test clean all

all: clean test examples/outputs/report.tsv assets/bibo_DocumentStatus.tsv rel_to_oxygen_example \
examples/outputs/non_attribute_metadata_sel_envs_partial.tsv
all: clean test examples/outputs/report.tsv

# ---------------------------------------
# Test runner
# ----------------------------------------
test:
$(RUN) pytest 2>&1 | tee logs/tests.log
$(RUN) pytest -sv tests/test_capitalization.py


clean:
Expand All @@ -32,28 +28,3 @@ clean:
examples/outputs/report.tsv: examples/gold.json
$(RUN) annotate-sample -R $@ $<

downloads/mixs6_core.tsv:
curl -L -s 'https://docs.google.com/spreadsheets/d/1QDeeUcDqXes69Y2RjU2aWgOpCVWo5OVsBX9MKmMqi_o/export?format=tsv&gid=178015749' > $@

examples/outputs/non_attribute_metadata_sel_envs_partial.tsv:
$(RUN) sqlite_client_cli \
--sqlite_path $(biosample_sqlite_file) \
--query "select * from non_attribute_metadata_sel_envs limit 9" \
--tsv_out $@

rel_to_oxygen_example: downloads/mixs6_core.tsv
$(RUN) rel_to_oxygen_example \
--sqlite_path $(biosample_sqlite_file) \
--mixs_core_path $<


bin/robot.jar:
curl -s https://api.github.com/repos/ontodev/robot/releases/latest | grep 'browser_download_url.*\.jar"' | cut -d : -f 2,3 | tr -d \" | wget -O $@ -i -

downloads/bibo.owl:
# --location (-L) pursues redirects
curl --location https://raw.githubusercontent.com/structureddynamics/Bibliographic-Ontology-BIBO/master/bibo.owl -o $@

assets/bibo_DocumentStatus.tsv: downloads/bibo.owl bin/robot.jar
java -jar bin/robot.jar query --input $< --query sparql/bibo_DocumentStatus.sparql $@
sed --in-place=.bak 's/^\?//' $@
67 changes: 38 additions & 29 deletions make-gold-cache.Makefile
Original file line number Diff line number Diff line change
@@ -1,30 +1,56 @@
# review and fix readmes and poetry dependencies

MAX_STUDIES=70000 # 2025-01
# see also https://github.com/microbiomedata/external-metadata-awareness/blob/751ddb6360f95f164a6605ca056e81fced59e195/Makefiles/gold.Makefile

RUN=poetry run

.PHONY: load-gold-biosamples-into-mongo

gold-to-mongo-all: gold-to-mongo-clean load-gold-biosamples-into-mongo

gold-to-mongo-clean:
rm -rf downloads/goldData.xlsx local/gold-study-ids-with-biosamples.txt

downloads/goldData.xlsx:
wget -O $@ "https://gold.jgi.doe.gov/download?mode=site_excel"
curl -o $@ "https://gold.jgi.doe.gov/download?mode=site_excel"

local/gold-studies.tsv: downloads/goldData.xlsx
poetry run python sample_annotator/file_utils/xlsx_to_tsv.py \
$(RUN) xlsx-to-tsv \
--excel-file $< \
--sheet-name Study \
--output-file $@

local/gold-study-ids.txt: local/gold-studies.tsv
# without the grep filter, this introduces some noise (non-id rows)
tail -n +2 $< | cut -f 1 | sort | grep 'Gs' > $@

local/gold-study-ids-subset.txt: local/gold-study-ids.txt
head -n $(MAX_STUDIES) $< > $@
# Extract Study GOLD IDs that have associated Biosample GOLD IDs
local/gold-study-ids-with-biosamples.txt: downloads/goldData.xlsx
date && time $(RUN) extract-study-ids-with-biosamples \
--excel-file $< \
--sheet-name 'Sequencing Project' \
--output-file $@.tmp && date # 8 minutes
sort $@.tmp | uniq > $@
rm -rf $@.tmp

# gold-to-mongo now supports both local and remote MongoDB servers, with or without authentication.
#
# Environment variables (from .env file):
# MONGODB_USER: MongoDB username
# MONGODB_PASSWORD: MongoDB password

# # --purge-mongodb
# # --purge-diskcache
# # --env-file

load-gold-biosamples-into-mongo: local/gold-study-ids-with-biosamples.txt
$(RUN) gold-to-mongo \
--authentication-file config/gold-key.txt \
--log-failures-to-file local/gold-to-mongo-failures.json \
--mongo-uri "mongodb://localhost:27017/gold_metadata" \
--study-ids-file $<

local/gold-cache.json: local/gold-study-ids-subset.txt
local/gold-cache.json: local/gold-studies.tsv
# ~ 3 seconds/uncached study
# GOLD has ~ 63k studies
# < 2 days to fetch all studies ?
poetry run python sample_annotator/clients/gold_client.py \
# ~ 2.5 days to fetch all studies with no hiccups
$(RUN) python sample_annotator/clients/gold_client.py \
--verbose \
fetch-studies \
--output-format json \
Expand All @@ -33,20 +59,3 @@ local/gold-cache.json: local/gold-study-ids-subset.txt
--authentication-file config/gold-key.txt \
$<

load-gold-biosamples-into-mongo: local/gold-study-ids-subset.txt
# --purge-mongodb
# --purge-diskcache
poetry run python sample_annotator/gold_to_mongo.py \
--authentication-file config/gold-key.txt \
--mongo-db-name gold_metadata \
--study-ids-file $<

#.PHONY: split-out-gold-biosamples
#split-out-gold-biosamples: local/gold-cache.json
# poetry run python sample_annotator/file_utils/split_out_gold_biosamples.py \
# --input-file $< \
# --study-output-file local/gold-studies-only.json \
# --biosample-output-file local/gold-biosamples-only.json \
# --project-output-file local/gold-projects-only.json \
# --remove-contacts \
# --remove-nulls
Loading