diff --git a/.github/workflows/link-check.yml b/.github/workflows/link-check.yml new file mode 100644 index 0000000..f377479 --- /dev/null +++ b/.github/workflows/link-check.yml @@ -0,0 +1,46 @@ +# Check links in documentation on every push and pull request +# Uses Lychee link checker to find broken links + +name: Check Links + +on: + push: + workflow_dispatch: + schedule: + # Run weekly on Sunday at 0 AM UTC to catch external link changes + - cron: '0 9 * * 0' + +permissions: + contents: read + pull-requests: write + issues: write + +jobs: + lychee: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Run Lychee link checker + id: lychee + uses: lycheeverse/lychee-action@v2 + with: + # Check all markdown files in docs/ and README.md + args: --verbose --no-progress --exclude-private 'docs/**/*.md' 'README.md' '**/*.md' + # Fail on broken links but allow exclusions + fail: true + env: + GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} + + - name: Create Issue from Failures + if: steps.lychee.outputs.exit_code != 0 && github.event_name == 'schedule' + uses: peter-evans/create-issue-from-file@v5 + with: + title: 'Link Checker Report - Broken Links Found' + body-file: lychee/out.md + labels: | + documentation + broken-links + automated-report + diff --git a/docs/.nav.yml b/docs/.nav.yml new file mode 100644 index 0000000..4570a41 --- /dev/null +++ b/docs/.nav.yml @@ -0,0 +1,4 @@ +nav: + - Home: index.md + - Calypr: calypr/ + - Tools: tools/ diff --git a/docs/assets/banner.png b/docs/assets/banner.png new file mode 100644 index 0000000..2d2525d Binary files /dev/null and b/docs/assets/banner.png differ diff --git a/docs/assets/banner_fade.png b/docs/assets/banner_fade.png new file mode 100644 index 0000000..b66a8cf Binary files /dev/null and b/docs/assets/banner_fade.png differ diff --git a/docs/assets/calypr_family.png b/docs/assets/calypr_family.png new file mode 100644 index 0000000..c33677c Binary files /dev/null and b/docs/assets/calypr_family.png differ diff --git a/docs/assets/funnel.png b/docs/assets/funnel.png new file mode 100644 index 0000000..6830652 Binary files /dev/null and b/docs/assets/funnel.png differ diff --git a/docs/assets/git-drs.png b/docs/assets/git-drs.png new file mode 100644 index 0000000..3ed8d9d Binary files /dev/null and b/docs/assets/git-drs.png differ diff --git a/docs/assets/grip.png b/docs/assets/grip.png new file mode 100644 index 0000000..aeb6f9b Binary files /dev/null and b/docs/assets/grip.png differ diff --git a/docs/assets/logo.png b/docs/assets/logo.png new file mode 100644 index 0000000..61ac554 Binary files /dev/null and b/docs/assets/logo.png differ diff --git a/docs/calypr/.nav.yml b/docs/calypr/.nav.yml new file mode 100644 index 0000000..c8a719a --- /dev/null +++ b/docs/calypr/.nav.yml @@ -0,0 +1,10 @@ +title: Calypr +nav: + - Quick Start Guide: quick-start.md + - Troubleshooting & FAQ: troubleshooting.md + - Data Management: data-management/ + - Analysis: analysis/ + - Data Model: data-model/ + - Project Management: project-management/ + - Calypr Admin: calypr-admin/ + - Website: website/ diff --git a/docs/calypr/analysis/.nav.yml b/docs/calypr/analysis/.nav.yml new file mode 100644 index 0000000..270dace --- /dev/null +++ b/docs/calypr/analysis/.nav.yml @@ -0,0 +1,2 @@ +nav: + - Data Querying + Gen3 SDK: query.md diff --git a/docs/workflows/query.md b/docs/calypr/analysis/query.md similarity index 94% rename from docs/workflows/query.md rename to docs/calypr/analysis/query.md index 62df636..a454c49 
100644 --- a/docs/workflows/query.md +++ b/docs/calypr/analysis/query.md @@ -12,20 +12,20 @@ Gen3 supports API access to Files and Metadata, allowing users to download and q ## 1. Dependency and Credentials -Prior to installing, check a profile credentials. +Prior to querying, ensure your DRS remotes are configured. Test: ```bash -g3t ping +git drs remote list ``` -- will return a list of projects that a profile has access to. +- will return a list of configured DRS remotes and projects you have access to. - For new setup or renew of gen3 credentials - Follow steps to configure/re-configure a profile with credentials: - - Download an API Key from the [Profile page](https://calypr.ohsu.edu.org/identity) and save it to `~/.gen3/credentials.json` + - Download an API Key from the [Profile page](https://calypr-public.ohsu.edu/identity) and save it to `~/.gen3/credentials.json` - ![Gen3 Profile page](../images/api-key.png) + ![Gen3 Profile page](../../images/api-key.png) - ![Gen3 Credentials](../images/credentials.png) + ![Gen3 Credentials](../../images/credentials.png) ## 2. Install diff --git a/docs/calypr/calypr-admin/.nav.yml b/docs/calypr/calypr-admin/.nav.yml new file mode 100644 index 0000000..5185daa --- /dev/null +++ b/docs/calypr/calypr-admin/.nav.yml @@ -0,0 +1,4 @@ +nav: + - Add users: add-users.md + - Role Based Access Control: approve-requests.md + - Creating a Project: creating-project.md diff --git a/docs/workflows/add-users.md b/docs/calypr/calypr-admin/add-users.md similarity index 72% rename from docs/workflows/add-users.md rename to docs/calypr/calypr-admin/add-users.md index 86cedac..8589726 100644 --- a/docs/workflows/add-users.md +++ b/docs/calypr/calypr-admin/add-users.md @@ -3,7 +3,7 @@ ## Granting user access to a project Once a project has been created you will have full access to it. -The project owner can add additional users to the project using the `g3t collaborator` commands. +The project owner can add additional users to the project using the `data-client collaborator` commands. There are two ways to request the addition additional users to the project: @@ -12,15 +12,14 @@ There are two ways to request the addition additional users to the project: To give another user full access to the project, run the following: ```sh -g3t collaborator add --write user-can-write@example.com +data-client collaborator add [project_id] user-can-write@example.com --write ``` Alternatively, to give another user read access only (without the ability to upload to the project), run the following: ```sh -g3t collaborator add user-read-only@example.com +data-client collaborator add [project_id] user-read-only@example.com ``` ## 2. Approvals -In order to implement these requests, **an authorized user will need to sign** the request before the user can use the remote repository. See `g3t collaborator approve --help -` +In order to implement these requests, **an authorized user will need to sign** the request before the user can use the remote repository. 
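Putting the two steps together, a typical grant-and-approve exchange might look like the following sketch (the project ID and email address are placeholders; it assumes only the `data-client collaborator` commands shown above):

```sh
# Project owner: request write access for a collaborator (placeholder values)
data-client collaborator add myprogram-myproject collaborator@example.com --write

# Data steward: review and sign the pending request
data-client collaborator approve
```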
See `data-client collaborator approve --help` diff --git a/docs/workflows/approve-requests.md b/docs/calypr/calypr-admin/approve-requests.md similarity index 90% rename from docs/workflows/approve-requests.md rename to docs/calypr/calypr-admin/approve-requests.md index 0c836d0..2577139 100644 --- a/docs/workflows/approve-requests.md +++ b/docs/calypr/calypr-admin/approve-requests.md @@ -16,8 +16,8 @@ * Ony users with the steward role can approve and sign a request ```text -g3t collaborator approve --help -Usage: g3t collaborator approve [OPTIONS] +./data-client collaborator approve --help +Usage: ./data-client collaborator approve [OPTIONS] Sign an existing request (privileged). @@ -40,9 +40,9 @@ Note: This example uses the ohsu program, but the same process applies to all pr ```text ## As an admin, I need to grant data steward privileges add the requester reader and updater role on a program to an un-privileged user -g3t collaborator add add data_steward_example@.edu --resource_path /programs//projects --steward +./data-client collaborator add add data_steward_example@.edu --resource_path /programs//projects --steward # As an admin, approve that request -g3t collaborator approve +./data-client collaborator approve diff --git a/docs/calypr/calypr-admin/creating-project.md b/docs/calypr/calypr-admin/creating-project.md new file mode 100644 index 0000000..0e85113 --- /dev/null +++ b/docs/calypr/calypr-admin/creating-project.md @@ -0,0 +1,48 @@ +--- +title: Creating a Project +--- + +## CLI + +```bash +$ git drs init --help + +Usage: git drs init [OPTIONS] [PROJECT_ID] + + Initialize a new repository. + +Options: + --debug Enable debug mode. G3T_DEBUG environment variable can also be used. + --help Show this message and exit. +``` + +## Overview +The `git drs init` command initializes a new project in your current working directory. It works with existing files in the directory and creates a couple important directories: + +* `META/`: A visible directory within your project that houses the FHIR metadata files. +* `CONFIG/`: Any additional configurations that can be used to customize the gen3 data platform. + +An initialized project will look something like this... + +``` +. +├── .git // git repository state +├── META // metadata in FHIR format +├── CONFIG +└── // existing data files maintained + └── ... +``` + +## Choosing a Project ID + +> In a Gen3 Data Commons, programs and projects are two administrative nodes in the graph database that serve as the most upstream nodes. A program must be created first, followed by a project. Any subsequent data submission and data access, along with control of access to data, is done through the project scope. +> [more](https://gen3.org/resources/operator/#6-programs-and-projects) + +A project ID initializes a unique project, taking the form of program-project. A project ID is significant because it determines the location of the remote repository, bucket storage, and access control. Project IDs have a set of constraints, particularly the program name is predefined by the institution, while the project name must be unique within the server and alphanumeric without spaces. Contact an admin for a list of supported program names. + +### Authorization +While you can work with an initialized repository locally, **an authorized user will need to sign** the project request before you can push your project to the data platform. 
You can confirm your project authorization with `git drs ping` + +## Next steps + +- [Adding data to a project](../calypr-projects/add-files.md) \ No newline at end of file diff --git a/docs/calypr/data-management/.nav.yml b/docs/calypr/data-management/.nav.yml new file mode 100644 index 0000000..bbff100 --- /dev/null +++ b/docs/calypr/data-management/.nav.yml @@ -0,0 +1,6 @@ +nav: + - Git-DRS: git-drs.md + - Managing Metadata: meta-data.md + - Adding FHIR metadata: metadata.md + - Common Errors: common-errors.md + diff --git a/docs/calypr/data-management/common-errors.md b/docs/calypr/data-management/common-errors.md new file mode 100644 index 0000000..1c1a72b --- /dev/null +++ b/docs/calypr/data-management/common-errors.md @@ -0,0 +1,61 @@ +# Common Errors + +## .ndjson is out of date +**Error:** After `git-drs` adding and committing a file, when you go to submit your data, "DocumentReference.ndjson is out of date", +```sh +$ git add file.txt +$ git commit -m "adding file.txt" +$ git push +Please correct issues before pushing. +Command `git drs status` failed with error code 1, stderr: WARNING: DocumentReference.ndjson is out of date 1969-12-31T16:00:00. The most recently changed file is MANIFEST/file.txt.dvc 2025-02-28T09:24:46.283870. Please check DocumentReferences.ndjson +No data file changes. +``` + +``` +$ forge meta init +DocumentReference file not found at META/DocumentReference.ndjson. Creating a new one with new records. +Processed 600 records +Finished writing all DocumentReference records. +Finished writing all Directory records. +``` + +To better understand the process of adding file metadata through the manifest, see [adding file metadata](add-files.md) and [adding FHIR metadata](metadata.md). + +## No new files to index + +**Error:** +```sh +$ git drs push +No new files to index. Use --overwrite to force +``` + +**Resolution:** When pushing data, `git-drs` checks the manifest (`MANIFEST/` directory) to see if there are any files to update, including new files or modified files. If no files have been modified, then the push will not go through. To push up the same file data or push up new FHIR metadata (`META/`), use `git drs push --overwrite` + +## Uncommitted changes + +**Error:** On the subsequent rounds of adding files, updating FHIR metadata, and committing the changes, you are unable to push up those new changes +``` +$ git drs add hello.txt +$ forge meta init +$ git commit -m "add hello file" + +$ git drs push +Uncommitted changes found. Please commit or stash them first. + +$ git drs status +No data file changes. +On branch main +Changes not staged for commit: + ... + modified: META/DocumentReference.ndjson +``` + +**Resolution:** This happened because the update FHIR metadata created in the META init was not staged for commit. To stage and commit the FHIR metadata, do: + +```sh +$ git add META/ +$ git commit -m "update DocumentReference.json" +$ git drs push +``` + +Note that `git add` is used here rather than `git drs add` because `git add` will update the project's FHIR metadata while `git drs add` only updates the project's manifest. If you want to commit multiple file changes, you can also use `git commit -am "update all files"`, where all changes get committed to the project. \ No newline at end of file diff --git a/docs/calypr/data-management/git-drs.md b/docs/calypr/data-management/git-drs.md new file mode 100644 index 0000000..a24b04b --- /dev/null +++ b/docs/calypr/data-management/git-drs.md @@ -0,0 +1,153 @@ +# Git-DRS + +!!! 
note + The tools listed here are under development and may be subject to change. + +## Overview + +Use case: As an analyst, in order to share data with collaborators, I need a way to create a project, upload files and associate those files with metadata. The system should be capable of adding files in an incremental manner. + +The following guide details the steps a data contributor must take to submit a project to the CALYPR data commons. + +### Core Concepts + +> In a Gen3 data commons, a semantic distinction is made between two types of data: "data files" and "metadata". [more](https://gen3.org/resources/user/dictionary/#understanding-data-representation-in-gen3) + +* **Data File**: Information like tabulated data values in a spreadsheet or a fastq/bam file containing DNA sequences. The contents are not exposed to the API as queryable properties. +* **Metadata**: Variables that help to organize or convey additional information about corresponding data files so they can be queried. + +## 1. Setup + +CALYPR project management is handled using standard Git workflows. you will need the **Large File Storage (LFS)** plugin to track genomic data files and the **Git-DRS** plugin to interface with CALYPR's storage and indexing systems. + +Visit the [Quick Start Guide](../../quick-start.md) for detailed, OS-specific installation instructions for these tools. + +| Tool | Purpose | +| :--- | :--- | +| **git-drs** | Manages large file tracking, storage, and DRS indexing. | +| **forge** | Handles metadata validation, transformation (ETL), and publishing. | +| **data-client** | Administrative tool for managing [collaborators and access requests](../../tools/data-client/access_requests.md). | +{: .caption } + +## 2. Initialize Project + +Once tools are installed and credentials are configured (see [Quick Start](../../quick-start.md)), initialize your project. + +### Formatting a new project + +If you are creating a new project, you may need to initialize some of the storage parameters. These define how the DRS system stores files related to your project. + +```bash +# Clone new repository +git clone https://github.com/your-org/new-calypr-repo.git +cd new-calypr-repo + +# Initialize with full configuration +git drs init --profile calypr \ + --url https://calypr-public.ohsu.edu/ \ + --cred ~/Downloads/calypr-credentials.json \ + --project my-project-id \ + --bucket my-bucket-name +``` +*Get project details from your data coordinator if needed.* + +### Directory Structure + +An initialized project will look something like this: + +``` +/ +├── .gitattributes +├── .gitignore +├── META/ +│ ├── ResearchStudy.ndjson +│ ├── DocumentReference.ndjson +│ └── .ndjson +├── data/ +│ ├── file1.bam +│ └── file2.fastq.gz +``` + +### Verify configuration + +You'll want to double check your storage settings, to ensure you know where files are being stored. Use the DRS remote list command: + +```bash +git drs remote list +``` + +## 3. Manage Files + +By using git-lfs and git-drs you will have a number of different options to add new files to a project. + +### 3.1: Configure File Tracking + +You'll need check which files LFS is tracking. If LFS doesn't track a file, it might be uploaded to Github directly, which should be avoided for large files. 
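Before relying on a tracking pattern, you can ask Git directly whether a given path will be handled by the LFS filter (standard Git behavior; the file name below is just an example):

```bash
# Check which filter applies to a candidate file (example path)
git check-attr filter -- data/sample1.bam
# A tracked file reports:   data/sample1.bam: filter: lfs
# An untracked file reports "unspecified" and would be committed to Git directly
```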
+ +```bash +# View current tracking +git lfs track + +# Track specific file extensions +git lfs track "*.bam" +git lfs track "*.vcf.gz" + +# Commit tracking configuration +git add .gitattributes +git commit -m "Configure LFS file tracking" +git push +``` + +### 3.2: Add Local Files + +```bash +# Add data files +git add data/sample1.bam +git add data/sample2.bam +git add results/analysis.vcf.gz + +# Verify LFS is tracking them (should show * prefix for staged LFS files) +git lfs ls-files +``` + +### 3.3: Register S3 Files + +If you have files that are already on S3, you can register them without downloading them. + +```bash +# Register file with inline credentials +git drs add-url s3://bucket-name/path/to/file.bam \ + --sha256 abc123def456... \ + --aws-access-key "your-access-key" \ + --aws-secret-key "your-secret-key" +``` + +## 4. Commit and Upload + +Once files are added or registered, commit your changes to sync with the CALYPR platform. + +```bash +# Commit files (creates DRS records via pre-commit hook) +git commit -m "Add genomic data files" + +# Upload to object store and register DRS records +git push +``` + +**What happens during push:** +1. Git-DRS creates DRS records for each tracked file. +2. Files are uploaded to the configured S3 bucket. +3. DRS URIs are registered in the Gen3 system. +4. Pointer files are committed to the repository. + +### Verifying upload + +```bash +git lfs ls-files +``` + +Files should now show `*` prefix (localized/uploaded). After completing the workflow: +* Files are visible in Git repository (as LFS pointers) +* DRS records are created +* Files are accessible via `git lfs pull` +* You can share DRS URIs with collaborators \ No newline at end of file diff --git a/docs/calypr/data-management/meta-data.md b/docs/calypr/data-management/meta-data.md new file mode 100644 index 0000000..e6b85b6 --- /dev/null +++ b/docs/calypr/data-management/meta-data.md @@ -0,0 +1,182 @@ +# Managing Metadata + +Metadata in Calypr is formatted using the Fast Healthcare Interoperability Resources (FHIR) schema. If you choose to bring your own FHIR newline delimited json data, you will need to create a directory called “META” in your git-drs repository in the same directory that you initialized your git-drs repository, and place your metadata files in that directory. +The META/ folder contains newline-delimited JSON (.ndjson) files representing FHIR resources describing the project, its data, and related entities. Large files are tracked using Git LFS, with a required correlation between each data file and a DocumentReference resource. This project follows a standardized structure to manage large research data files and associated FHIR metadata in a version-controlled, DRS and FHIR compatible format. +Each file must contain only one type of FHIR resource type, for example META/ResearchStudy.ndjson only contains research study resource typed FHIR objects. The name of the file doesn’t have to match the resource type name, unless you bring your own document references, then you must use DocumentReference.ndjson. For all other FHIR file types this is simply a good organizational practice for organizing your FHIR metadata. + +## META/ResearchStudy.ndjson + +* The File directory structure root research study is based on the 1st Research Study in the document. This research study is the research study that the autogenerated document references are connected to. Any additional research studies that are provided will be ignored when populating the miller table file tree. 
+* Contains at least one FHIR ResearchStudy resource describing the project. +* Defines project identifiers, title, description, and key attributes. + +## META/DocumentReference.ndjson + +* Contains one FHIR DocumentReference resource per Git LFS-managed file. +* Each `DocumentReference.content.attachment.url` field: + * Must exactly match the relative path of the corresponding file in the repository. + * Example: + +```json +{ + "resourceType": "DocumentReference", + "id": "docref-file1", + "status": "current", + "content": [ + { + "attachment": { + "url": "data/file1.bam", + "title": "BAM file for Sample X" + } + } + ] +} +``` + +Place your custom FHIR `.ndjson` files in the `META/` directory: + +```bash +# Copy your prepared FHIR metadata +cp ~/my-data/patients.ndjson META/ +cp ~/my-data/observations.ndjson META/ +cp ~/my-data/specimens.ndjson META/ +cp ~/my-data/document-references.ndjson META/ +``` + +## Other FHIR data + +\[TODO More intro text here\] + +* Patient.ndjson: Participant records. +* Specimen.ndjson: Biological specimens. +* ServiceRequest.ndjson: Requested procedures. +* Observation.ndjson: Measurements or results. +* Other valid FHIR resource types as required. + +Ensure your FHIR `DocumentReference` resources reference the DRS URIs: + +Example `DocumentReference` linking to S3 file: + +```json +{ + "resourceType": "DocumentReference", + "id": "doc-001", + "status": "current", + "content": [{ + "attachment": { + "url": "drs://calypr-public.ohsu.edu/your-drs-id", + "title": "sample1.bam", + "contentType": "application/octet-stream" + } + }], + "subject": { + "reference": "Patient/patient-001" + } +} +``` + + +--- + +## Validating Metadata + +To ensure that the FHIR files you have added to the project are correct and pass schema checking, you can use the [Forge tool](../../tools/forge/index.md). + +```bash +forge validate +``` + +Successful output: + +✓ Validating META/patients.ndjson... OK +✓ Validating META/observations.ndjson... OK +✓ Validating META/specimens.ndjson... OK +✓ Validating META/document-references.ndjson... OK +All metadata files are valid. + +Fix any validation errors and re-run until all files pass. + + +### Forge Data Quality Assurance Command Line Commands + +If you have provided your own FHIR resources there are two commands that might be useful to you for ensuring that your FHIR metadata will appear on the CALYPR data platform as expected. These commands are validate and check-edge + +**Validate:** +```bash +forge validate META +# or +forge validate META/DocumentReference.ndjson +``` +Validation checks if the provided directory or file will be accepted by the CALYPR data platform. It catches improper JSON formatting and FHIR schema errors. + +**Check-edge:** +```bash +forge check-edge META +# or +forge validate META/DocumentReference.ndjson +``` +Check-edge ensures that references within your files (e.g., a Patient ID in an Observation) connect to known vertices and aren't "orphaned". + +### Validation Process + +#### 1\. Schema Validation + +* Each .ndjson file in META/ (like ResearchStudy.ndjson, DocumentReference.ndjson, etc.) is read line by line. +* Every line is parsed as JSON and checked against the corresponding FHIR schema for that resourceType. +* Syntax errors, missing required fields, or invalid FHIR values trigger clear error messages with line numbers. + +#### 2\. Mandatory Files Presence + +* Confirms that: + * ResearchStudy.ndjson exists and has at least one valid record. 
+ * DocumentReference.ndjson exists and contains at least one record. +* If either is missing or empty, validation fails. + +#### 3\. One-to-One Mapping of Files to DocumentReference + +* Scans the working directory for Git LFS-managed files in expected locations (e.g., data/). +* For each file, locates a corresponding DocumentReference resource whose content.attachment.url matches the file’s relative path. +* Validates: + * All LFS files have a matching DocumentReference. + * All DocumentReferences point to existing files. + +#### 4\. Project-level Referential Checks + +* Validates that DocumentReference resources reference the same ResearchStudy via relatesTo or other linking mechanisms. +* If FHIR resources like Patient, Specimen, ServiceRequest, Observation are present, ensures: + * Their id fields are unique. + * DocumentReference correctly refers to those resources (e.g., via subject or related fields). + +#### 5\. Cross-Entity Consistency + +* If multiple optional FHIR .ndjson files exist: + * Confirms IDs referenced in one file exist in others. + * Detects dangling references (e.g., a DocumentReference.patient ID that's not in Patient.ndjson). + +--- + +#### ✅ Example Error Output + +ERROR META/DocumentReference.ndjson line 4: url "data/some\_missing.bam" does not resolve to an existing file +ERROR META/Specimen.ndjson line 2: id "specimen-123" referenced in Observation.ndjson but not defined + +--- + +#### 🎯 Purpose & Benefits + +* Ensures all files and metadata are in sync before submission. +* Prevents submission failures due to missing pointers or invalid FHIR payloads. +* Enables CI integration, catching issues early in the development workflow. + +--- + +#### Validation Requirements + +Automated tools or CI processes must: + +* Verify presence of META/ResearchStudy.ndjson with at least one record. +* Verify presence of META/DocumentReference.ndjson with one record per LFS-managed file. +* Confirm every DocumentReference.url matches an existing file path. +* Check proper .ndjson formatting. + +--- \ No newline at end of file diff --git a/docs/workflows/metadata.md b/docs/calypr/data-management/metadata.md similarity index 72% rename from docs/workflows/metadata.md rename to docs/calypr/data-management/metadata.md index f44204a..d21b861 100644 --- a/docs/workflows/metadata.md +++ b/docs/calypr/data-management/metadata.md @@ -1,6 +1,5 @@ # Adding FHIR metadata - ## Background Adding files to a project is a two-step process: @@ -8,20 +7,20 @@ Adding files to a project is a two-step process: 1. Adding file metadata entries to the manifest (see [adding files](add-files.md)) 2. Creating FHIR-compliant metadata using the manifest -This page will guide you through the second step of generating FHIR metadata in your g3t project. To understand the FHIR data model, see [FHIR for Researchers](../data-model/introduction.md) +This page will guide you through the second step of generating FHIR metadata in your `git-drs` project. To understand the FHIR data model, see [FHIR for Researchers](../data-model/introduction.md) -## Generating FHIR Data using g3t +## Generating FHIR Data using git-drs -To submit metadata from the manifest to the platform, that metadata needs to be converted into FHIR standard. We will use the file metadata entries we had created during the `g3t add` on our data files. +To submit metadata from the manifest to the platform, that metadata needs to be converted into FHIR standard. 
We will use the file metadata entries we had created during the `git drs add` on our data files. ### Creating metadata files using the manifest -Using the file metadata entries created by the `g3t add` command, `g3t meta init` creates FHIR-compliant metadata files in the `META/` directory, where each file corresponds to a [FHIR resource](https://build.fhir.org/resourcelist.html). At a minimum, this directory will create: +Using the file metadata entries created by the `git drs add` command, `forge meta init` creates FHIR-compliant metadata files in the `META/` directory, where each file corresponds to a [FHIR resource](https://build.fhir.org/resourcelist.html). At a minimum, this directory will create: | File | Contents | -|--------------------------|----------------------------| -| ResearchStudy.ndjson | Description of the project | -| DocumentReference.ndjson | File information | +22: |--------------------------|----------------------------| +23: | ResearchStudy.ndjson | Description of the project | +24: | DocumentReference.ndjson | File information | Depending on if a `patient` or `specimen` flag was specified, other resources can be added to the metadata files: @@ -32,19 +31,18 @@ Depending on if a `patient` or `specimen` flag was specified, other resources ca * measurements (Observation) -* This command will create a skeleton metadata file for each file added to the project using the `patient`, `specimen`, `task`, and/or `observation` flags specified by the `g3t add` command. +* This command will create a skeleton metadata file for each file added to the project using the `patient`, `specimen`, `task`, and/or `observation` flags specified by the `git drs add` command. * You can edit the metadata to map additional fields. * The metadata files can be created at any time, but the system will validate them before the changes are committed. * **Note:** If an existing file is modified, it won't get automatically staged - For instance, if `DocumentReference.json` is already created and it has to be updated to reflect an additional file, this change is not automatically staged. - - Make sure to either `git add META/` or use the `-a` flag in `g3t commit` to ensure that your FHIR metadata changes are staged. + - Make sure to either `git add META/` to ensure that your FHIR metadata changes are staged. ### Example To add a cram file that's associated with a subject, sample, and particular task ```sh -g3t add myfile.cram --patient P0 --specimen P0-BoneMarrow --task_id P0-Sequencing -g3t meta init +git add myfile.cram --patient P0 --specimen P0-BoneMarrow --task_id P0-Sequencing ``` This will produce metadata with the following relationships: @@ -54,8 +52,8 @@ This will produce metadata with the following relationships: When the project is committed, the system will validate new or changed records. You may validate the metadata on demand by: ```sh -$ g3t meta validate --help -Usage: g3t meta validate [OPTIONS] DIRECTORY +$ forge meta validate --help +Usage: forge meta validate [OPTIONS] DIRECTORY Validate FHIR data in DIRECTORY. @@ -73,16 +71,15 @@ All FHIR metadata is housed in the `META/` directory. The convention of using a ## Supplying your own FHIR metadata -In some cases, it might be useful to supply your own FHIR metadata without using `g3t add` to create any file metadata. In that case, adding metadata would take on the following flow: +In some cases, it might be useful to supply your own FHIR metadata without using `git drs add` to create any file metadata. 
In that case, adding metadata would take on the following flow: 1. Initialize your project 2. Copy external FHIR data as `.ndjson` files to your `META/` directory 3. `git add META/` -4. `g3t commit -m "supplying FHIR metadata"` +4. `git commit -m "supplying FHIR metadata"` This process would be useful for individuals who want to use the system to track relations between metadata but might not necessarily want to connect their actual data files to the system. ## Next Steps -* See the tabular metadata section for more information on working with metadata. -* See the commit and push section for more information on publishing. \ No newline at end of file +See the [data management section](../data-management/meta-data.md) for more information on working with metadata and publishing. \ No newline at end of file diff --git a/docs/calypr/data-model/.nav.yml b/docs/calypr/data-model/.nav.yml new file mode 100644 index 0000000..10af5e2 --- /dev/null +++ b/docs/calypr/data-model/.nav.yml @@ -0,0 +1,4 @@ +nav: + - Integrating your data: integration.md + - FHIR for Researchers: introduction.md + - Creating and Uploading Metadata: metadata.md diff --git a/docs/data-model/integration.md b/docs/calypr/data-model/integration.md similarity index 86% rename from docs/data-model/integration.md rename to docs/calypr/data-model/integration.md index b3f9608..3d4fa13 100644 --- a/docs/data-model/integration.md +++ b/docs/calypr/data-model/integration.md @@ -5,9 +5,9 @@ Converting tabular data (CSV, TSV, spreadsheet, database table) into FHIR (Fast As you create a upload files, you can tag them with identifiers which by default will create minimal, skeleton graph. -You can retrieve that data using the g3t command line tool, and update the metadata to create a more complete graph representing your study. +You can retrieve that data using the [git-drs](../../tools/git-drs/index.md) command line tool, and update the metadata using [forge](../../tools/forge/index.md) to create a more complete graph representing your study. -You may choose to work with the data in it's "native" json format, or convert it to a tabular format for integration. The system will re-convert tabular data back to json for submittal. +You may choose to work with the data in its "native" JSON format, or convert it to a tabular format for integration. The system will re-convert tabular data back to JSON for submittal. The process of integrating your data into the graph involves several steps: @@ -24,12 +24,12 @@ The process of integrating your data into the graph involves several steps: * Normalize Data: Split the spreadsheet data into FHIR-compliant resources. * Step 4: Utilize provided FHIR Tooling or Libraries - * FHIR Tooling: Use `g3t meta dataframe ` and associated libraries to support data conversion and validation. - * Validation: Use `g3t meta validate` to validate the transformed data against FHIR specifications to ensure compliance and accuracy. + * FHIR Tooling: Use `forge meta` and associated libraries to support data conversion and validation. + * Validation: Use `forge validate` to validate the transformed data against FHIR specifications to ensure compliance and accuracy. * Step 5: Import into FHIR-Compatible System - * Load Data: Use `g3t commit` to load the transformed data into the calypr system. - * Testing and Verification: Use `g3t push` to ensure your data appears correctly in the portal and analysis tools. + * Load Data: Use `git commit` and `git push` to manage your local data state. 
+ * Testing and Verification: Ensure your data appears correctly in the portal and analysis tools after a successful push. * Step 6: Iterate and Refine * Review and Refine: Check for any discrepancies or issues during the import process. Refine the conversion process as needed. @@ -76,7 +76,7 @@ Identifiers in FHIR references typically include the following components: [see > A string, typically numeric or alphanumeric, that is associated with a single object or entity within a given system. Typically, identifiers are used to connect content in resources to external content available in other frameworks or protocols. -System: Indicates the system or namespace to which the identifier belongs. By default the namespace is `http://calypr.ohsu.edu.org/`. +System: Indicates the system or namespace to which the identifier belongs. By default the namespace is `http://calypr-public.ohsu.edu/`. Value: The actual value of the identifier within the specified system. For instance, a lab controlled subject identifier or a specimen identifier. @@ -109,4 +109,4 @@ By using identifiers in references, FHIR ensures that data can be accurately lin > A reference to a document of any kind for any purpose. [see more](https://hl7.org/fhir/documentreference.html) -See the metadata workflow section for more information on how to create and upload metadata. +See the [data management section](../data-management/meta-data.md) for more information on how to create and upload metadata. diff --git a/docs/data-model/introduction.md b/docs/calypr/data-model/introduction.md similarity index 74% rename from docs/data-model/introduction.md rename to docs/calypr/data-model/introduction.md index 96599e2..e32b47e 100644 --- a/docs/data-model/introduction.md +++ b/docs/calypr/data-model/introduction.md @@ -5,6 +5,12 @@ Given all of the intricacies healthcare and experimental data, we use Fast Healt ## What is FHIR? +> In a Gen3 data commons, a semantic distinction is made between two types of data: "data files" and "metadata". [more](https://gen3.org/resources/user/dictionary/#understanding-data-representation-in-gen3) + +A "data file" could be information like tabulated data values in a spreadsheet or a fastq/bam file containing DNA sequences. The contents of the file are not exposed to the API as queryable properties, so the file must be downloaded to view its content. + +"Metadata" are variables that help to organize or convey additional information about corresponding data files so that they can be queried via the Gen3 data commons’ API or viewed in the Gen3 data commons’ data exploration tool. In a Gen3 data dictionary, variable names are termed "properties", and data contributors provide the values for these pre-defined properties in their data submissions. + In an era where healthcare information is abundant yet diverse and often siloed, FHIR emerges as a standard, empowering research analysts to navigate, aggregate, and interpret health data seamlessly. This guide aims to unravel the intricacies of FHIR, equipping research analysts with the knowledge and tools needed to harness the potential of interoperable healthcare data for insightful analysis and impactful research outcomes in the context of CALYPR collaborations. 
## Graph Model @@ -21,7 +27,7 @@ The following "file focused" example illustrates how CALYPR uses FHIR resources Examine [resource](https://www.hl7.org/fhir/resource.html) definitions [here](http://www.hl7.org/fhir/resource.html): -* Details on [uploaded files](https://calypr.github.io/workflows/upload/) are captured as [DocumentReference](http://www.hl7.org/fhir/documentreference.html) +* Details on uploaded files are captured as [DocumentReference](http://www.hl7.org/fhir/documentreference.html) * DocumentReference.[subject](https://www.hl7.org/fhir/documentreference-definitions.html#DocumentReference.subject) indicates who or what the document is about: * Can simply point to the [ResearchStudy](https://hl7.org/fhir/researchstudy.html), to indicate the file is part of the study diff --git a/docs/data-model/metadata.md b/docs/calypr/data-model/metadata.md similarity index 88% rename from docs/data-model/metadata.md rename to docs/calypr/data-model/metadata.md index c12298a..c085af9 100644 --- a/docs/data-model/metadata.md +++ b/docs/calypr/data-model/metadata.md @@ -62,6 +62,6 @@ gen3_util meta publish /tmp/$PROJECT_ID ## View the Files -This final step uploads the metadata associated with the project and makes the files visible on the [Explorer page](https://calypr.ohsu.edu.org/explorer). +This final step uploads the metadata associated with the project and makes the files visible on the [Explorer page](https://calypr-public.ohsu.edu/Explorer). -![Gen3 File Explorer](./explorer.png) +![Gen3 File Explorer](../website/explorer.png) diff --git a/docs/calypr/index.md b/docs/calypr/index.md new file mode 100644 index 0000000..57efbd9 --- /dev/null +++ b/docs/calypr/index.md @@ -0,0 +1,43 @@ +# CALYPR Platform Overview + +Welcome to the **CALYPR Platform**. CALYPR is a next-generation genomic data science ecosystem designed to bridge the gap between massive, centralized data commons and the agile, distributed workflows of modern researchers. + +--- + +## The CALYPR Philosophy + +Traditional data repositories often create "data silos" where information is easy to store but difficult to move, version, or integrate with external tools. CALYPR breaks these silos by embracing **Interoperability**, **Reproducibility**, and **Scalability**. + +### 1. Interoperability (GA4GH Standards) +CALYPR is built from the ground up on [GA4GH](https://www.ga4gh.org/) standards. By using the **Data Repository Service (DRS)** and **Task Execution Service (TES)**, CALYPR ensures that your data and workflows can move seamlessly between different cloud providers and on-premises high-performance computing (HPC) clusters. + +### 2. Reproducibility (Git-like Data Management) +The core of the CALYPR experience is **Git-DRS**. We believe that data management should feel as natural as code management. Git-DRS allows you to track, version, and share massive genomic datasets using the familiar `git` commands, ensuring that every analysis is backed by a specific, immutable version of the data. + +### 3. Scalability (Hybrid Cloud Infrastructure) +Whether you are working with a few genomes or petabyte-scale cohorts, CALYPR's architecture—powered by **Gen3**—scales to meet your needs. Our hybrid cloud approach allows for secure data storage in AWS while leveraging your local compute resources when necessary. 
+ +--- + +## How it Works: The Connected Commons + +CALYPR acts as the "connective tissue" between your research environment and the cloud: + +* **Data Commons (Gen3):** Provides the robust backend for metadata management, indexing, and authentication. +* **Version Control ([Git-DRS](../tools/git-drs/index.md)):** Manages the "smudge" and "clean" operations for large files, allowing you to treat remote DRS objects as local files. +* **Metadata Orchestration ([Forge](../tools/forge/index.md)):** Streamlines the validation, publishing, and harmonizing of genomic metadata. +* **Compute ([Funnel](../tools/funnel/index.md)):** Executes complex pipelines across distributed environments using standardized task definitions. +* **Graph Insights ([GRIP](../tools/grip/index.md)):** Enables high-performance queries across heterogeneous datasets once integrated. + +--- + +## Next Steps +To get moving on the "Happy Path," we recommend following these steps: + +1. **[Quick Start Guide](quick-start.md):** The fastest way to install tools and start tracking data. +2. **[Data & Metadata](data-management/meta-data.md):** Learn how to associate your biological samples with the files you've uploaded. + +--- + +!!! info "Private Beta" + CALYPR is currently in a private beta phase. We are actively working with a select group of research partners to refine the platform. If you encounter any issues or have feature requests, please reach out to the team. \ No newline at end of file diff --git a/docs/calypr/project-management/.nav.yml b/docs/calypr/project-management/.nav.yml new file mode 100644 index 0000000..abc0374 --- /dev/null +++ b/docs/calypr/project-management/.nav.yml @@ -0,0 +1,4 @@ +nav: + - Create a Project (gen3 + GitHub): create-project.md + - Project Customization: custom-views.md + - Publishing project: publishing-project.md diff --git a/docs/calypr/project-management/create-project.md b/docs/calypr/project-management/create-project.md new file mode 100644 index 0000000..474f144 --- /dev/null +++ b/docs/calypr/project-management/create-project.md @@ -0,0 +1,17 @@ + + +# Create a Project (gen3 \+ GitHub) + +Status: *Manual and DevOps‑only at the moment* + +The standard way to start a new Calypr project is to create a Git repository that will hold your FHIR NDJSON files and a set of Git‑LFS tracked files. + +For now you will need to ask a Calypr management team to create the project and provide you with the following: + +* GitHub repository URL +* Calypr project ID +* Initial git config settings (branch, remotes, etc.) + +Future Work: Automate this step with a CLI wizard. + +TODO – Write the DevOps‑only project creation guide. diff --git a/docs/calypr/project-management/custom-views.md b/docs/calypr/project-management/custom-views.md new file mode 100644 index 0000000..afebfdf --- /dev/null +++ b/docs/calypr/project-management/custom-views.md @@ -0,0 +1,186 @@ + +# Project Customization + +## Dataframer Configuration + +The dataframer is used to render the FHIR `.ndjson` files into the tabular space that is used in the explorer page table. If you want to customize your project’s explorer page you will need to specify database field names that are defined in the dataframer, thus you will need to run the dataframer on your data ahead of time in order to know these field names. 
+ +See below steps for setting up `git-drs` and running dataframer commands: + +```bash +python -m venv venv +source venv/bin/activate +pip install gen3-tracker==0.0.7rc27 +git-drs meta dataframe DocumentReference +``` + +The explorer config is a large JSON document. One of the keys of note is `guppyConfig`, which is used to specify what index is to be used for the explorer page tab that you have defined. Notice that when you run `git-drs meta dataframe` it outputs: + +```text +Usage: git-drs meta dataframe [OPTIONS] {Specimen|DocumentReference|ResearchSubject|MedicationAdministration|GroupMember} [DIRECTORY_PATH] [OUTPUT_PATH] + +Try 'git-drs meta dataframe --help' for help. +``` + +Where `Specimen`, `DocumentReference`, etc. are the supported indices that can be run in the dataframe and defined in the `explorerConfig` under the `guppyConfig` key. + +Note that the `guppyConfig` index names use `snake_case` formatting whereas the dataframer uses uppercase for each word. + +## 5.2 Explorer Page Configuration + +Forge currently supports customization of explorer pages by routing to: `https://commons-url/Explorer/[program]-[project]` + +Explorer Configs can be customized by running `forge config init` and then filling out the template configuration. + +The explorer config is a JSON document with a top-level key called `explorerConfig` which can host a list of "tab" configs. The tabs (e.g., "Patient", "Specimen", and "File") each denote an element in this config. + +In this example, the `guppyConfig.dataType` is set to `document_reference`. We ran the `DocumentReference` dataframer command earlier to select database field names from the generated output. + +```json +{ + "explorerConfig": [ + { + "tabTitle": "TEST", + "guppyConfig": { + "dataType": "document_reference", + "nodeCountTitle": "file Count", + "fieldMapping": [] + }, + "filters": { + "tabs": [ + { + "title": "Filters", + "fields": [ + "document_reference_assay", + "document_reference_creation", + "project_id" + ], + "fieldsConfig": { + "project_id": { + "field": "project_id", + "label": "Project Id", + "type": "enum" + }, + "assay": { + "field": "document_reference_assay", + "label": "Assay", + "type": "enum" + }, + "creation": { + "field": "document_reference_creation", + "label": "Creation", + "type": "enum" + } + } + } + ] + }, + "table": { + "enabled": true, + "fields": [ + "project_id", + "document_reference_assay", + "document_reference_creation" + ], + "columns": { + "project_id": { + "field": "project_id", + "title": "Project ID" + }, + "assay": { + "field": "document_reference_assay", + "title": "Assay" + }, + "creation": { + "field": "document_reference_creation", + "title": "Creation" + } + } + }, + "dropdowns": {}, + "buttons": [], + "loginForDownload": false + } + ] +} +``` + +And here is what this config looks like in the frontend: + +Note that since there is only one element in the `explorerConfig` there is only one tab called “TEST” in the explorer page which is housed as `tabTitle` in the config. + +#### Filters + +The next important section is the `filters` key. This defines the filters column on the left-hand side of the page. Within that block there is the `fields` key and the `fieldsConfig` key. The `fields` key is used to specify the names of the fields that you want to filter on. 
To get the names of the fields, you will need to install `git-drs` via PyPI and run a dataframer command, which creates the explorer table dataframe so that you can configure in the frontend which parts of that dataframe should be shown.

Returning to the configuration, the specified fields come directly from the column names at the top of the Excel spreadsheet generated by running the dataframer command. You can choose any number or combination of these column names, but note that any list specified in this config is rendered in the frontend in exactly the order given.

The `fieldsConfig` key is an optional decorator dict that can be applied to every specified filter. The `label` key denotes the preferred display name to use for the database field name taken from the dataframer Excel spreadsheet.

#### Table

The last important section is the `table` key. As with the filters structure, `fields` denotes all of the database column names that should be displayed in the explorer table. Also similar to the filters structure, `columns` is where you specify the label that you want displayed for each database field; here, `field` is the database name and `title` is the display label.

The rest of the config is templating that is needed for the explorer page to load, but nothing that is directly useful.

#### Shared Filters

Imagine you want to filter on facets across multiple indices, similar to a RESTful join operation: for example, all of the patients belonging to a given `project_id` that also have a specimen matching that `project_id`.

This is known as “shared filtering” because you are assuming that you want to carry your filters over to the new index when you click a new tab. This only works if an equivalent field exists on the other index/tab, so it must be configurable and is not applicable to all normal filterable fields.

Setting this up is straightforward. Specify a filter that you want to share, e.g. `project_id`, then specify the indices and the field name on each index that the filter is shared with. For our purposes `project_id` is known as `project_id` on all indices, but this may not always be the case, and proper inspection or knowledge of the dataset may be required to determine this.

Each “shared filter” is then specified as a JSON dictionary list element under the field that you have chosen, which enables shared filtering on that field. To define additional shared filters, simply add another key under the `defined` dictionary key and specify the list of indices and fields that the shared filter can be joined on. See the example below for details.

```json
"sharedFilters": {
  "defined": {
    "project_id": [
      { "index": "research_subject", "field": "project_id" },
      { "index": "specimen", "field": "project_id" },
      { "index": "document_reference", "field": "project_id" }
    ]
  }
}
```

## 5.3 Configurator

Now that you have the basics down, this frontend GUI should start to make sense. Notice that this is the exact same config that was shown earlier, except it is customizable via the GUI so that you don’t need to wrestle with the JSON to get a working, correctly formatted config. Notice also that there is a third column here: Charts.
Charts are defined very simply:

```json
"charts": {
  "specimen_collection": {
    "chartType": "fullPie",
    "title": "Metastasis Site"
  }
}
```

Just provide the DB column name as the parent key, then the chart type and the display title of the chart. The chart displays binned counts in a histogram style. Currently only `fullPie`, `bar`, or `donut` chart types are supported, but other chart types might be added in the future.

As stated earlier, configs follow a specific naming convention, `[program]-[project].json`, and will be rejected if you do not have write permissions on the specified program/project or if the configuration name is not of that form. You can also load any configs that you have access to, edit them, and then repost them.

All customizable explorer pages are viewable by routing to `/Explorer/[program]-[project]`, assuming that all specified database fields exist in the database.

# **Advanced Docs**

---

# **🧬 Managing Identifiers with CALYPR Meta**

This guide explains how to manage dataset identifiers, both manually and through the command line, and how those identifiers integrate with Git-LFS and git-drs for reproducible, FAIR-compliant data management.

### 🧭 Introduction: Where This Fits in Your Research Data Lifecycle

This document applies once you’ve begun organizing data files for a research study and are ready to make their metadata machine-readable and FAIR-compliant. Researchers typically progress through several stages:

1. **Files only**: you start with a set of raw or processed data files associated with a research study.
2. **Files with identifiers**: each file is linked to key entities such as Patients, Specimens, or Assays using `META/identifiers.tsv`.
3. **Files with identifiers + attributes**: you begin adding structured tabular metadata (e.g., `Patient.tsv`, `Specimen.tsv`, `Observation.tsv`) describing those entities.
4. **Files with complete FHIR metadata**: you can now transform these TSVs into fully-formed FHIR resources (`Patient.ndjson`, `Specimen.ndjson`, etc.) suitable for sharing, indexing, and integration with clinical or genomic data platforms.

This guide focuses on stages 2 and 3 — converting well-structured TSV metadata files into standard FHIR resources, while validating that every entity’s identifier corresponds to the entries defined in `META/identifiers.tsv`.

--- \ No newline at end of file diff --git a/docs/calypr/project-management/publishing-project.md b/docs/calypr/project-management/publishing-project.md new file mode 100644 index 0000000..9494f15 --- /dev/null +++ b/docs/calypr/project-management/publishing-project.md @@ -0,0 +1,53 @@

## 4.6: Publishing changes to Gen3

In order to publish metadata to CALYPR, regardless of whether you have provided your own metadata or you are simply uploading files to the system, you will need to publish your data. Publishing data is done with the **Forge** command line utility.

Since Forge relies on your GitHub repository to know which files should have metadata records on the CALYPR platform, a GitHub Personal Access Token (PAT) is needed. To create your own PAT, log in to [https://source.ohsu.edu](https://source.ohsu.edu), go to Settings > Tokens, and click "Generate new token". Make sure the token has `clone` permissions at the minimum.
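If you prefer not to type the token directly on the command line each time, one option is to keep it in a local file with tight permissions and substitute it when publishing (a sketch; the file path and token value are placeholders):

```bash
# Store the token once, outside the repository (example location)
printf '%s\n' 'ghp_xxxxxxxxxxxxxxxx' > ~/.calypr_pat
chmod 600 ~/.calypr_pat

# Supply it to the publish command described below
forge publish "$(cat ~/.calypr_pat)"
```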
+ +To publish, run: +```bash +forge publish [your_PAT] +``` + +### Publishing Process + +To publish your metadata, run the following command: + +```bash +forge publish +``` + +What happens: + +1. Forge validates your GitHub Personal Access Token +2. Packages repository information +3. Submits a Sower job to Gen3 +4. Gen3 ingests FHIR metadata from META/ +5. Metadata becomes searchable in CALYPR + +Successful output: + +✓ Personal Access Token validated +✓ Repository information packaged +✓ Sower job submitted: job-id-12345 +✓ Metadata ingestion started + +Check job status: forge status \ +Get all job ids: forge list + +📖 More details: [Forge Publish Command](https://github.com/copilot/tools/forge/commands.md#forge-publish) + +--- + +### Verification Checklist + +After completing the workflow: + +* LFS pointer files in Git repository +* DRS records created +* DRS URIs point to S3 locations +* Metadata files validated successfully +* Sower job completed without errors +* Data searchable in CALYPR web interface +* Can query patients/observations in Gen3 +* Files accessible via S3 (no duplicate storage) \ No newline at end of file diff --git a/docs/calypr/quick-start.md b/docs/calypr/quick-start.md new file mode 100644 index 0000000..12458e3 --- /dev/null +++ b/docs/calypr/quick-start.md @@ -0,0 +1,139 @@ +--- +title: Quick Start Guide +--- + +# Quick Start Guide + +To get started with CALYPR, you will need to install [git-lfs](https://git-lfs.github.com/) and [git-drs](https://github.com/calypr/git-drs), a "git" like command line tool for uploading and downloading files to the [gen3 platform](https://gen3.org/). + +### Git-LFS Installation Instructions + +To use CALYPR, you must first install [Git Large File Storage (LFS)](https://git-lfs.github.com/) on your system. This allows Git to efficiently handle the large genomic data files. + +=== "macOS" + **Install using Homebrew** + ```bash + brew install git-lfs + ``` + +=== "Linux" + **Install via Package Manager** + + === "Debian/Ubuntu" + `sudo apt-get install git-lfs` + + === "RHEL/CentOS" + `sudo yum install git-lfs` + + === "Fedora" + `sudo dnf install git-lfs` + +=== "Windows" + **Download and Run Installer** + Download the latest [Git LFS Windows installer](https://github.com/git-lfs/git-lfs/releases/latest) and follow the setup instructions. + + +**Initialize Git LFS** +Run the following command in your terminal to complete the setup: +```bash +git lfs install --skip-smudge +``` + +## Project Setup + +You first need to set up a project and initialize it: + +``` +mkdir MyNewCalyprProject +cd MyNewCalyprProject +git init +git drs init +``` + +Now that you have initialized your project you have created a very primitive Git Large File Support (LFS) backed git repository. + +## Download Gen3 API Credentials + +To use the git-drs, you need to configure `git-drs` with API credentials downloaded from the [Profile page](https://calypr-public.ohsu.edu/Profile). + +![Gen3 Profile page](../images/profile.png) + +Log into the website. Then, download the access key from the portal and save it in the standard location `~/.gen3/credentials.json` + +![Gen3 API Key](../images/api-key.png) + +![Gen3 Credentials](../images/credentials.png) + +### Configure a git-drs repository with a Gen3 Credential. + +Now that you have a Gen3 API credential, you can attach the credential to your git-drs +repository by adding it as a drs remote. + +git-drs requires a bucket name and a project id to defined in this command. 
+
+The bucket name is the name of the S3 bucket that you plan to upload your data to. This bucket must be configured inside the CALYPR instance; contact the CALYPR team to set up a bucket.
+
+The project ID must be in the form `ORGANIZATION-PROJECTNAME`.
+
+From the command line, within your new project, run the `git-drs remote add` command:
+
+=== "Example Command"
+    ```sh
+    git-drs remote add gen3 \
+    --cred= \
+    --project \
+    --bucket
+    ```
+
+=== "Mac/Linux"
+    ```sh
+    git-drs remote add gen3 cbds \
+    --cred=~/Downloads/credentials.json \
+    --project testProgram-testProject \
+    --bucket testBucket
+
+    ```
+=== "Windows"
+    ```sh
+    git-drs remote add gen3 cbds \
+    --cred=C:\Users\demo\Downloads\credentials.json \
+    --project testProgram-testProject \
+    --bucket testBucket
+    ```
+
+You can confirm your configuration and access by listing your remotes:
+
+```sh
+git drs remote list
+```
+
+This will show your configured profiles and the projects you have access to.
+
+
+## Remaining Execution
+
+From this point forward, git-drs functions exactly like git-lfs; see the [git-lfs documentation](https://github.com/git-lfs/git-lfs/tree/main/docs?utm_source=gitlfs_site&utm_medium=docs_link&utm_campaign=gitlfs) for more in-depth documentation.
+
+An example of uploading a file to CALYPR and downloading it is shown below:
+
+### Upload Files
+
+```
+# Track files
+git lfs track "*.bam"
+git add .gitattributes
+
+# Add and commit files
+git add my-file.bam
+git commit -m "Add data file"
+git push
+```
+### Downloading existing files
+
+```
+git clone mylfsrepo
+cd mylfsrepo
+git drs init
+git drs remote add gen3 myProfile --cred ~/.gen3/credentials.json --project cbds-my_lfs_repo --bucket cbds
+git lfs pull -I "*.bam"
+```
diff --git a/docs/calypr/troubleshooting.md b/docs/calypr/troubleshooting.md
new file mode 100644
index 0000000..9b69555
--- /dev/null
+++ b/docs/calypr/troubleshooting.md
@@ -0,0 +1,61 @@
+# Troubleshooting & FAQ
+
+Common issues encountered when working with the CALYPR platform and its tools.
+
+---
+
+## Metadata is "out of date"
+
+**Issue:** When attempting to push or validate, you receive a warning that `DocumentReference.ndjson` or other metadata files are out of date.
+
+**Resolution:** This typically happens when you have added new data files using `git add` or `git-drs` but haven't updated the corresponding FHIR metadata to reflect these changes.
+
+1. **Regenerate Metadata:** Use Forge to synchronize your metadata with the current repository state:
+    ```bash
+    forge meta init
+    ```
+2. **Stage Changes:** Ensure the updated metadata files in the `META/` directory are staged:
+    ```bash
+    git add META/
+    ```
+3. **Commit:**
+    ```bash
+    git commit -m "Update metadata for new files"
+    ```
+
+---
+
+## No new files to index
+
+**Issue:** Running `git push` or a registration command returns "No new files to index."
+
+**Resolution:** This indicates that the current state of your files is already synchronized with the remote server. If you need to force an update to the metadata or re-register existing files, use the specific tool's overwrite flag (e.g., `git drs push --overwrite`).
+
+---
+
+## Uncommitted changes preventing push
+
+**Issue:** You receive an error about "Uncommitted changes found" when trying to push data.
+
+**Resolution:** Standard Git rules apply. If you've run commands that modify the `META/` directory, you must commit those changes before pushing.
+```bash +git add META/ +git commit -m "Refining metadata" +git push +``` + +--- + +## Authentication Errors + +**Issue:** Commands fail with "Unauthorized" or "401" errors. + +**Resolution:** +1. **Check Credentials:** Ensure your `credentials.json` is valid and hasn't expired. You can download a fresh key from the [CALYPR Profile Page](https://calypr-public.ohsu.edu/Profile). +2. **Verify Configuration:** Run `git drs remote list` to ensure the correct endpoint and project ID are configured for your current profile. +3. **Token Refresh:** If using temporary tokens, ensure they are still active. + +--- + +!!! tip "Getting Help" + If your issue isn't listed here, please reach out to our team at [support@calypr.org](mailto:support@calypr.org) or search the individual tool documentation in the [Tools Section](../tools/index.md). diff --git a/docs/calypr/website/.nav.yml b/docs/calypr/website/.nav.yml new file mode 100644 index 0000000..352e519 --- /dev/null +++ b/docs/calypr/website/.nav.yml @@ -0,0 +1,3 @@ +nav: + - Download: portal-download.md + - Explore: portal-explore.md diff --git a/docs/calypr/website/download-single-file.png b/docs/calypr/website/download-single-file.png new file mode 100644 index 0000000..46f3125 Binary files /dev/null and b/docs/calypr/website/download-single-file.png differ diff --git a/docs/calypr/website/explorer.png b/docs/calypr/website/explorer.png new file mode 100644 index 0000000..d8e89e5 Binary files /dev/null and b/docs/calypr/website/explorer.png differ diff --git a/docs/calypr/website/file-manifest.png b/docs/calypr/website/file-manifest.png new file mode 100644 index 0000000..0e0cb31 Binary files /dev/null and b/docs/calypr/website/file-manifest.png differ diff --git a/docs/calypr/website/portal-download.md b/docs/calypr/website/portal-download.md new file mode 100644 index 0000000..68117ff --- /dev/null +++ b/docs/calypr/website/portal-download.md @@ -0,0 +1,31 @@ +--- +title: Download +--- + +There are two main ways to download files: + +1. Individually through the browser or through the command line with the `gen3-client` +2. Batch downloads through the command line with `git-drs` and `git-lfs` + +This guide will walk you through both methods below. + +--- + +### Batch Download with Git-DRS + +To retrieve the actual data files described by a repository, you must clone the repository and use `git lfs pull`. + +```bash +# 1. Clone the repository +git clone +cd + +# 2. Initialize Git-DRS +git drs init + +# 3. Add the DRS remote (see Quick Start for details) +git drs remote add gen3 calypr --project --bucket --cred ~/.gen3/credentials.json + +# 4. Pull the files +git lfs pull +``` \ No newline at end of file diff --git a/docs/calypr/website/portal-explore.md b/docs/calypr/website/portal-explore.md new file mode 100644 index 0000000..231c253 --- /dev/null +++ b/docs/calypr/website/portal-explore.md @@ -0,0 +1,9 @@ + +# Explore + +The `push` command uploads the metadata associated with the project and makes the files visible on the [Explorer page](https://calypr-public.ohsu.edu/Explorer). + +![Gen3 File Explorer](./explorer.png) + + +See the [portal download page](portal-download.md) for more information on downloading files from the portal. 
diff --git a/docs/getting-started.md b/docs/getting-started.md deleted file mode 100644 index 6aa94c3..0000000 --- a/docs/getting-started.md +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: Getting Started ---- - -{% include '/note.md' %} - -Use case: As an analyst, in order to share data with collaborators, I need a way to create a project, upload files and associate those files with metadata. The system should be capable of adding files in an incremental manner. - -The following guide details the steps a data contributor must take to submit a project to the CALYPR data commons. - -> In a Gen3 data commons, a semantic distinction is made between two types of data: "data files" and "metadata". [more](https://gen3.org/resources/user/dictionary/#understanding-data-representation-in-gen3) - -A "data file" could be information like tabulated data values in a spreadsheet or a fastq/bam file containing DNA sequences. The contents of the file are not exposed to the API as queryable properties, so the file must be downloaded to view its content. - -"Metadata" are variables that help to organize or convey additional information about corresponding data files so that they can be queried via the Gen3 data commons’ API or viewed in the Gen3 data commons’ data exploration tool. In a Gen3 data dictionary, variable names are termed "properties", and data contributors provide the values for these pre-defined properties in their data submissions. - -For the CALYPR data commons, we have created a data dictionary based on the FHIR data standard. The data dictionary is available [here](https://github.com/bmeg/iceberg-schema-tools) - -## Examples - -> In a Gen3 Data Commons, programs and projects are two administrative nodes in the graph database that serve as the most upstream nodes. A program must be created first, followed by a project. Any subsequent data submission and data access, along with control of access to data, is done through the project scope. -> [more](https://gen3.org/resources/operator/#6-programs-and-projects) - -For the following examples, we will use the `calypr` program with a project called `myproject`, please use the `g3t projects ls` command to verify what programs you have access to. 
diff --git a/docs/images/api-key.png b/docs/images/api-key.png index bf27e88..6adaf3e 100644 Binary files a/docs/images/api-key.png and b/docs/images/api-key.png differ diff --git a/docs/images/credentials-json.png b/docs/images/credentials-json.png deleted file mode 100644 index c35016c..0000000 Binary files a/docs/images/credentials-json.png and /dev/null differ diff --git a/docs/images/credentials.png b/docs/images/credentials.png index 89f68c7..d1b783e 100644 Binary files a/docs/images/credentials.png and b/docs/images/credentials.png differ diff --git a/docs/images/file-manifest-download copy.png b/docs/images/file-manifest-download copy.png new file mode 100644 index 0000000..d8e89e5 Binary files /dev/null and b/docs/images/file-manifest-download copy.png differ diff --git a/docs/images/file-manifest-download.png b/docs/images/file-manifest-download.png deleted file mode 100644 index 9cc4717..0000000 Binary files a/docs/images/file-manifest-download.png and /dev/null differ diff --git a/docs/images/login.png b/docs/images/login.png deleted file mode 100644 index 37af6c9..0000000 Binary files a/docs/images/login.png and /dev/null differ diff --git a/docs/images/profile.png b/docs/images/profile.png index c69fb96..411f06a 100644 Binary files a/docs/images/profile.png and b/docs/images/profile.png differ diff --git a/docs/index.md b/docs/index.md index 7ad8184..ed0c43b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,16 +1,116 @@ -# Welcome to the CALYPR Documentation +--- +template: home.html +hide: + - navigation + - toc + - header +--- -![CALYPR site](./images/website_header.png) + + -This documentation will walk you through the steps for submitting data to the [CALYPR Data Commons](https://calypr.ohsu.edu.org). +
+

CALYPR Platform

+

+ A scalable, hybrid cloud infrastructure designed for the demands of modern genomics research. + Built on open-source standards, CALYPR provides GA4GH-compliant tools for seamless data integration, analysis, and biological insights. Based on the Gen3 Data Commons architecture, CALYPR empowers analysts to manage large-scale genomic datasets and integrate data to build new predictive models. +

+ +
-## About -The [gen3-tracker](https://github.com/CALYPR/gen3_util/) (g3t) command line utility is a combination of tools that facilitate data sharing on the CALYPR platform. It allows you to create a unified data project, upload files, and associate those files with metadata in an incremental manner. Submitted data with g3t gives you all the benefits the data platform offers: data indexing, data exploration, consolidated access, and more! +
-The following guide details the steps a data contributor must take to submit a project to the CALYPR data commons. -## Getting Started +
+

Built on Open Standards

+
+
+
TES
+
Task Execution Service. A GA4GH standard for distributed task execution to enable federated computing.
+
+
+
DRS
+
Data Repository Service. A GA4GH standard for data discovery and access.
+
+
+
FHIR
+
Fast Healthcare Interoperability Resources. A standard for exchanging patient health information.
+
+
+
JSON Hyper-Schema
+
JSON Schema + graph data, used to represent complex, high-quality data.
+
+
+ +
-To navigate through each page, use pages list in the top left or using the navigation arrow on the bottom left and right! Otherwise, check out our [requirements](requirements.md) page to get started. +
-![Main landing page for CALYPR IDP](./images/main-page.png) +
+ + + + +
+
+ GRIP +
+
+

GRIP

+

Graph-based data integration for complex research datasets.

+

High-performance graph query engine that provides a unified interface across MongoDB, SQL, and key-value stores. Ideal for complex relational discovery in genomics.

+ Learn more +
+
+ + +
+
+ Funnel +
+
+

Funnel

+

Distributed task execution for petabyte-scale pipelines.

+

Standardized batch computing using the GA4GH TES API. Run Docker-based tasks seamlessly across AWS, Google Cloud, and Kubernetes at any scale.

+ Learn more +
+
+ + +
+
+ Git-DRS +
+
+

Git-DRS

+

Secure data repository system with version control.

+

Manage large-scale genomic data with integrated versioning and metadata management, ensuring reproducibility and data integrity throughout research cycles.

+ Learn more +
+
+
+ +
+ +
+

Join the Beta

+

+ CALYPR is currently in private beta. If you are interested in early access or a demonstration of the platform, please reach out to us at + sales@calypr.com. In the meantime, you can explore our GitHub repository and get access to all of our open source tools. +

+
diff --git a/docs/note.md b/docs/note.md deleted file mode 100644 index 1b65636..0000000 --- a/docs/note.md +++ /dev/null @@ -1,2 +0,0 @@ -!!! note - The tools listed here are under development and may be subject to change. diff --git a/docs/requirements.md b/docs/requirements.md deleted file mode 100644 index 62eb900..0000000 --- a/docs/requirements.md +++ /dev/null @@ -1,154 +0,0 @@ ---- -title: Requirements ---- - -# Requirements - -## 1. Download gen3-client - -gen3-client to upload and download files to the [gen3 platform](https://gen3.org/). Since the CALYPR is built on gen3, gen3-client is used in gen3-tracker (g3t) for the same purpose. See the instructions below for how to download gen3-client for your operating system. - -### Installation Instructions - - -=== "macOS" - 1. Download the [macOS version](https://github.com/CALYPR/cdis-data-client/releases/latest/download/gen3-client-macos.pkg) of the gen3-client. - 2. Run the gen3-client pkg, following the instructions in the installer. - 3. Open a terminal window. - 4. Create a new gen3 directory: `mkdir ~/.gen3` - 5. Move the executable to the gen3 directory: `mv /Applications/gen3-client ~/.gen3/gen3-client` - 6. Change file permissions: `chown $USER ~/.bash_profile` - 7. Add the gen3 directory to your PATH environment variable: `echo 'export PATH=$PATH:~/.gen3' >> ~/.bash_profile` - 8. Refresh your PATH: `source ~/.bash_profile` - 9. Check that the program is downloaded: run `gen3-client` - - -=== "Linux" - 1. Download the [Linux version](https://github.com/CALYPR/cdis-data-client/releases/latest/download/gen3-client-linux-amd64.zip) of the gen3-client. - 2. Unzip the archive. - 3. Open a terminal window. - 4. Create a new gen3 directory: `mkdir ~/.gen3` - 5. Move the unzipped executable to the gen3 directory: `~/.gen3/gen3-client` - 6. Change file permissions: `chown $USER ~/.bash_profile` - 7. Add the gen3 directory to your PATH environment variable: `echo 'export PATH=$PATH:~/.gen3' >> ~/.bash_profile` - 8. Refresh your PATH: `source ~/.bash_profile` - 9. Check that the program is downloaded: run `gen3-client` - -=== "Windows" - 1. Download the [Windows version](https://github.com/CALYPR/cdis-data-client/releases/latest/download/gen3-client-windows-amd64.zip) of the gen3-client. - 2. Unzip the archive. - 3. Add the unzipped executable to a directory, for example: `C:\Program Files\gen3-client\gen3-client.exe` - 4. Open the Start Menu and type "edit environment variables". - 5. Open the option "Edit the system environment variables". - 6. In the "System Properties" window that opens up, on the "Advanced" tab, click on the "Environment Variables" button. - 7. In the box labeled "System Variables", find the "Path" variable and click "Edit". - 8. In the window that pops up, click "New". - 9. Type in the full directory path of the executable file, for example: `C:\Program Files\gen3-client` - 10. Click "Ok" on all the open windows and restart the command prompt if it is already open by entering cmd into the start menu and hitting enter. - -## 2. Configure a gen3-client Profile with Credentials - -To use the gen3-client, you need to configure `gen3-client` with API credentials downloaded from the [Profile page](https://calypr.ohsu.edu.org/Profile). - -![Gen3 Profile page](images/profile.png) - -Log into the website. 
Then, download the access key from the portal and save it in the standard location `~/.gen3/credentials.json` - -![Gen3 Credentials](images/credentials.png) - -From the command line, run the gen3-client configure command: - -=== "Example Command" - ```sh - gen3-client configure \ - --profile= \ - --cred= \ - --apiendpoint=https://calypr.ohsu.edu.org - ``` - -=== "Mac/Linux" - ```sh - gen3-client configure \ - --profile=calypr \ - --cred=~/Downloads/credentials.json \ - --apiendpoint=https://calypr.ohsu.edu.org - ``` -=== "Windows" - ```sh - gen3-client configure \ - --profile=calypr \ - --cred=C:\Users\demo\Downloads\credentials.json \ - --apiendpoint=https://calypr.ohsu.edu.org - ``` - -Run the `gen3-client auth` command to confirm you configured a profile with the correct authorization privileges. Then, to list your access privileges for each project in the commons you have access to: - -```sh -gen3-client auth --profile=calypr - -# 2023/12/05 15:07:12 -# You have access to the following resource(s) at https://calypr.ohsu.edu.org: -# 2023/12/05 15:07:12 /programs/calypr/projects/myproject... -``` - -## 3. Install gen3-tracker (g3t) - -The `gen3-tracker (g3t)` tool requires a working Python 3 installation no older than [Python 3.12](https://www.python.org/downloads/release/python-3120/). Check your version with `python3 --version`. If needed, download a compatible version of [Python 3](https://www.python.org/downloads/). - -Optionally, create a virtual environment using venv or conda for g3t. We will use [venv](https://docs.python.org/3/library/venv.html) in the instructions. - -``` -python3 -m venv venv; source venv/bin/activate -``` - -Run the following in your working directory to install the latest version of g3t from the Python Package Index: - -```sh -pip install gen3-tracker -``` - -You can verify the installation was successful by then running the `g3t` command with the expected output being the [latest version](https://pypi.org/project/gen3-tracker/#history): - -```sh -g3t --version -``` - -### Upgrading g3t - -This version should match the latest version on the [PyPi page](https://pypi.org/project/gen3-tracker/). If it is out of date, run the following to upgrade your local version: - -```sh -pip install -U gen3-tracker -``` - -### Configuration - -g3t uses the [gen3-client](https://gen3.org/resources/user/gen3-client/#2-configure-a-profile-with-credentials) configuration flow. - -After configuration, you can either specify the `--profile` or set the `G3T_PROFILE=profile-name` environmental variable. - -### Testing the configuration - -The command `g3t ping` will confirm that the access key and gen3-client have been configured correctly - -```sh -g3t --profile calypr ping -``` - -A successful ping will output something like: - -> msg: 'Configuration OK: Connected using profile:calypr' -> -> endpoint: https://calypr.ohsu.edu.org -> -> username: someone@example.com -> -> bucket_programs: -> -> ... -> -> your_access: -> -> ... - -With g3t completely set up, see the [Quickstart Guide](/workflows/quick-start-guide) for how to upload and download data to a project. 
diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css index 638e7e8..7872c27 100644 --- a/docs/stylesheets/extra.css +++ b/docs/stylesheets/extra.css @@ -1,10 +1,29 @@ +@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap'); + /* Prevent the '$' character in shell blocks from being copied */ .gp { user-select: none; } -h1, h2, h3 { - font-weight: bold !important; +:root { + --md-primary-fg-color: #0057B7; + --md-primary-fg-color--light: #4698CA; + --card-background: #ffffff; + --card-shadow: 0 4px 20px rgba(0, 0, 0, 0.08); + --card-shadow-hover: 0 12px 30px rgba(0, 0, 0, 0.12); + --text-muted: #64748b; + --transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1); +} + +body { + font-family: 'Inter', -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; +} + +h1, +h2, +h3 { + font-weight: 700 !important; + letter-spacing: -0.02em; } /* horizontal dividers */ @@ -13,13 +32,277 @@ h1, h2, h3 { display: block; width: 100%; height: 1px; - background-color: lightgrey; + background-color: #e2e8f0; margin-top: 0.5em; margin-bottom: 1.5em; } -/* colors */ -:root > * { - --md-primary-fg-color: #0057B7; - --md-primary-fg-color--light: #4698CA; +/* Hero section container */ +.md-hero { + background-image: linear-gradient(135deg, var(--md-primary-fg-color), #1e40af); + background: + linear-gradient(rgba(0, 48, 102, 0.4), rgba(0, 48, 102, 0.4)), + url("../assets/banner_fade.png"); + background-size: cover; + background-position: center; + color: white; + padding: 6rem 0; + clip-path: ellipse(150% 100% at 50% 0%); +} + +.md-hero__inner { + display: flex; + flex-direction: column; + align-items: center; + text-align: center; +} + +.md-hero__content h1 { + font-size: 3rem; + font-weight: 800; + margin-bottom: 1rem; + text-shadow: 0 2px 10px rgba(0, 0, 0, 0.1); +} + +.md-hero__content div { + font-size: 1.25rem; + max-width: 40rem; + margin-bottom: 2rem; + opacity: 0.95; + font-weight: 400; +} + +/* Product Grid */ +.product-grid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(320px, 1fr)); + gap: 2rem; + max-width: 1100px; + margin: 0rem auto 4rem; + padding: 0 1rem; + position: relative; + z-index: 10; +} + +/* Professional Product Card */ +.product-card { + background: var(--card-background); + border-radius: 12px; + box-shadow: var(--card-shadow); + overflow: hidden; + transition: var(--transition); + border: 1px solid rgba(226, 232, 240, 0.8); + display: flex; + flex-direction: column; +} + +.product-card:hover { + transform: translateY(-8px); + box-shadow: var(--card-shadow-hover); + border-color: var(--md-primary-fg-color--light); +} + +.product-card--featured:hover { + transform: none; +} + +/* Featured / Umbrella Card */ +.product-card--featured { + grid-column: 1 / -1; + flex-direction: column; + min-height: 500px; + background: white; +} + +.product-card--featured .product-card__image-wrap { + width: 100%; + height: 650px; + border-bottom: none; + border-right: none; + padding: 0; + background: #f8fafc; +} + +/* Gradient transition from image to text */ +.product-card--featured .product-card__image-wrap::after { + content: ""; + position: absolute; + bottom: 0; + left: 0; + width: 100%; + height: 80px; + background: linear-gradient(to bottom, transparent, white); + z-index: 2; +} + +.product-card--featured .product-card__image { + width: 100%; + height: 100%; + max-width: none; + max-height: none; + object-fit: cover; + object-position: center 20%; + z-index: 1; +} + +.product-card--featured 
.product-card__content { + padding: 2rem 4rem 4rem; + text-align: center; + align-items: center; +} + +.product-card--featured .product-card__title { + font-size: 2.8rem; + margin-bottom: 1rem; +} + +.product-card--featured .product-card__summary { + font-size: 1.4rem; + margin-bottom: 1.5rem; + max-width: 800px; } + +.product-card--featured .product-card__description { + font-size: 1.1rem; + max-width: 750px; + margin-bottom: 2rem; +} + +.product-card__image-wrap { + position: relative; + width: 100%; + height: 180px; + background: #f8fafc; + border-bottom: 1px solid #f1f5f9; + display: flex; + align-items: center; + justify-content: center; + overflow: hidden; +} + +.product-card__image { + max-width: 80%; + max-height: 80%; + object-fit: contain; + transition: var(--transition); +} + +.product-card:hover .product-card__image { + transform: scale(1.05); +} + +.product-card--featured:hover .product-card__image { + transform: scale(1); +} + +.product-card__content { + padding: 1.5rem; + flex-grow: 1; + display: flex; + flex-direction: column; +} + +.product-card__title { + color: #0f172a; + font-size: 1.5rem; + font-weight: 700; + margin-bottom: 0.75rem; +} + +.product-card__summary { + color: #334155; + font-size: 0.95rem; + font-weight: 500; + margin-bottom: 0.75rem; + line-height: 1.4; +} + +.product-card__description { + color: var(--text-muted); + font-size: 0.875rem; + line-height: 1.6; + margin-bottom: 1.5rem; + flex-grow: 1; +} + +.product-card__link { + display: inline-flex; + align-items: center; + color: var(--md-primary-fg-color); + font-weight: 600; + font-size: 0.95rem; + text-decoration: none; + transition: var(--transition); +} + +.product-card__link i { + margin-left: 0.25rem; + transition: var(--transition); +} + +.product-card__link:hover { + color: #1a73e8; +} + +.product-card__link:hover i { + transform: translateX(4px); +} + +.product-card--featured .product-card__link:hover i { + transform: none; +} + +/* Responsive */ +@media screen and (max-width: 768px) { + .product-grid { + grid-template-columns: 1fr; + margin-top: 2rem; + } + + .md-hero__content h1 { + font-size: 2.25rem; + } + + .product-card--featured { + flex-direction: column; + } + + .product-card--featured .product-card__image-wrap { + width: 100%; + height: 200px; + border-right: none; + border-bottom: none; + padding: 2rem 2rem 0; + } + + .product-card--featured .product-card__content { + padding: 1.5rem; + } + + .product-card--featured .product-card__title { + font-size: 1.75rem; + } +} + +/* Sidebar Navigation Styles */ +/* Target top-level section headers, nested collapsible headers, and index links */ +.md-nav__item--section>.md-nav__link, +.md-nav__item--section>label.md-nav__link, +.md-nav__item--section>.md-nav__link--index, +.md-nav__link[for], +.md-nav__item--nested>.md-nav__link, +.md-nav__link--index { + font-weight: 700 !important; + color: #000000 !important; + opacity: 1 !important; +} + +.md-header { + z-index: 11; +} + +/* Adjust top-level sidebar items that are links to follow a similar weight */ +.md-nav--primary>.md-nav__list>.md-nav__item>.md-nav__link { + font-weight: 700; + color: #000000; +} \ No newline at end of file diff --git a/docs/tools/.nav.yml b/docs/tools/.nav.yml new file mode 100644 index 0000000..ddc8347 --- /dev/null +++ b/docs/tools/.nav.yml @@ -0,0 +1 @@ +title: Tools diff --git a/docs/tools/data-client/.nav.yml b/docs/tools/data-client/.nav.yml new file mode 100644 index 0000000..6c70a75 --- /dev/null +++ b/docs/tools/data-client/.nav.yml @@ -0,0 +1,6 @@ +title: 
Data Client +nav: + - Welcome: index.md + - Authentication: authentication.md + - Data Management: data_management.md + - Access Requests: access_requests.md diff --git a/docs/tools/data-client/access_requests.md b/docs/tools/data-client/access_requests.md new file mode 100644 index 0000000..e5750bd --- /dev/null +++ b/docs/tools/data-client/access_requests.md @@ -0,0 +1,68 @@ +--- +title: Access & Collaboration +--- + +# Access & Collaboration + +The `data-client` includes tools to manage user access and collaboration through the **Requestor** service. This allows project administrators to invite users (collaborators) to projects and manage access requests. + +## Managing Collaborators + +The `collaborator` command suite is used to add or remove users from projects. + +### Add a User + +To give a user access to a project: + +```bash +./data-client collaborator add [project_id] [username] --profile= +``` + +- **project_id**: Format `program-project` (e.g., `SEQ-Res`). +- **username**: The user's email address. + +**Options:** +- `--write` (`-w`): Grant write access. +- `--approve` (`-a`): Automatically approve the request (if you have admin permissions). + +### Remove a User + +To revoke access: + +```bash +./data-client collaborator rm [project_id] [username] --profile= +``` + +**Options:** +- `--approve` (`-a`): Automatically approve the revocation. + +## Managing Requests + +### List Requests + +List access requests associated with you or a user. + +```bash +./data-client collaborator ls --profile= +``` + +**Options:** +- `--mine`: List your requests. +- `--active`: List only active requests. +- `--username`: List requests for a specific user (admin only). + +### List Pending Requests + +See requests waiting for approval. + +```bash +./data-client collaborator pending --profile= +``` + +### Approve a Request + +If you are a project administrator, you can approve pending requests. + +```bash +./data-client collaborator approve [request_id] --profile= +``` diff --git a/docs/tools/data-client/authentication.md b/docs/tools/data-client/authentication.md new file mode 100644 index 0000000..fa37a1e --- /dev/null +++ b/docs/tools/data-client/authentication.md @@ -0,0 +1,45 @@ +--- +title: Authentication & Access +--- + +# Authentication & Access (Fence) + +The `data-client` uses the **Fence** service to manage authentication and user access privileges. + +## Authentication Setup + +Authentication is handled via the `configure` command using an API Key credential file. See [Configuration](index.md#configuration) for details. + +When you run a command, the `data-client`: +1. Validates your API Key. +2. Requests a temporary Access Token from Fence. +3. Uses this Access Token for subsequent API calls. + +If your Access Token has expired, the client automatically refreshes it using your API Key. + +## Checking Privileges + +You can verify your current access privileges and see which projects/resources you have access to using the `auth` command. + +### Command + +```bash +./data-client auth --profile= +``` + +### Example Usage + +```bash +./data-client auth --profile=mycommons +``` + +### Output + +The command lists the resources (projects) you can access and the specific permissions you have for each (e.g., read, write, delete). 
+ +```text +You have access to the following resource(s) at https://data.mycommons.org: + +/programs/program1/projects/projectA [read, read-storage, write-storage] +/programs/program1/projects/projectB [read] +``` diff --git a/docs/tools/data-client/data_management.md b/docs/tools/data-client/data_management.md new file mode 100644 index 0000000..30dfbb9 --- /dev/null +++ b/docs/tools/data-client/data_management.md @@ -0,0 +1,64 @@ +--- +title: Data Management +--- + +# Data Management + +The `data-client` facilitates secure data transfer between your local environment and the Gen3 Data Commons using the **Indexd** (indexing) and **Fence** (authentication) services. + +## Uploading Data + +You can upload files or directories for registration and storage in the Data Commons. The process handles: +1. Registering the file with `Indexd` (creating a GUID). +2. Obtaining a presigned URL from `Fence`. +3. Uploading the file content to object storage (e.g., S3). + +### Command + +```bash +./data-client upload --profile= --upload-path= +``` + +### Options + +- `--upload-path`: Path to a single file, a folder, or a glob pattern (e.g., `data/*.bam`). +- `--batch`: Enable parallel uploads for better performance. +- `--numparallel`: Number of parallel uploads (default: 3). +- `--bucket`: Target bucket (if not using default). +- `--metadata`: Look for `[filename]_metadata.json` sidecar files to upload metadata alongside the file. + +### Example + +Upload a single file: +```bash +./data-client upload --profile=mycommons --upload-path=data/sample.bam +``` + +Upload a directory with parallel processing: +```bash +./data-client upload --profile=mycommons --upload-path=data/ --batch --numparallel=5 +``` + +## Downloading Data + +You can download data using their GUIDs (Globally Unique Identifiers). + +### Command + +```bash +./data-client download --profile= --guid= +``` + +### Options + +- `--guid`: The GUID of the file to download. +- `--no-prompt`: Skip overwrite confirmation prompts. +- `--dir`: Target directory for download (default: current directory). + +To download multiple files, you can use the `download-multiple` functionality (often via manifest, check `./data-client download --help` for specific usages as they may vary). + +### Example + +```bash +./data-client download --profile=mycommons --guid=dg.1234/5678-abcd +``` diff --git a/docs/tools/data-client/index.md b/docs/tools/data-client/index.md new file mode 100644 index 0000000..2a6000c --- /dev/null +++ b/docs/tools/data-client/index.md @@ -0,0 +1,57 @@ +--- +title: Data Client +--- + +# Data Client + +The `data-client` is the modern CALYPR client library and CLI tool. It serves two primary purposes: +1. **Data Interaction**: A unified interface for uploading, downloading, and managing data in Gen3 Data Commons. +2. **Permissions Management**: It handles user access and project collaboration, replacing older tools like `calypr_admin`. + +## Architecture + +The `data-client` is built upon a modular architecture centered around the `Gen3Interface`. This interface acts as a facade, coordinating interactions with specific Gen3 services. 
+ +```mermaid +graph TD + CLI[Data Client CLI] --> G3I[Gen3Interface] + G3I --> Auth[Fence Client] + G3I --> Idx[Indexd Client] + G3I --> Job[Sower Client] + G3I --> Req[Requestor Client] + + Auth --> |Authentication/Tokens| FenceService((Fence Service)) + Idx --> |File Registration| IndexdService((Indexd Service)) + Job --> |Job Submission| SowerService((Sower Service)) + Req --> |Access Requests| RequestorService((Requestor Service)) +``` + +### Components + +The `data-client` integrates the following Gen3 clients: + +- **Fence Client**: Handles authentication (API keys, Access Tokens) and presigned URL generation for data access. +- **Indexd Client**: Manages file registration (GUIDs), indexing, and file location resolution. +- **Sower Client**: Manages job submissions and monitoring (e.g., for data analysis workflows). +- **Requestor Client**: Handles data access requests and collaboration management. + +## Configuration + +The `data-client` uses a configuration profile system to manage credentials for different Gen3 commons. + +Configuration is stored in `~/.gen3/gen3_client_config.ini`. + +### Setting up a Profile + +To configure a new profile, you need an API Key (Credential file) downloaded from the Gen3 Commons profile page. + +```bash +./data-client configure --profile= --cred= --apiendpoint= +``` + +Example: +```bash +./data-client configure --profile=mycommons --cred=credentials.json --apiendpoint=https://data.mycommons.org +``` + +Once configured, you can use the `--profile` flag in other commands to target this environment. diff --git a/docs/tools/forge/.nav.yml b/docs/tools/forge/.nav.yml new file mode 100644 index 0000000..ab1e833 --- /dev/null +++ b/docs/tools/forge/.nav.yml @@ -0,0 +1,6 @@ +title: Forge +nav: + - Overview: index.md + - Validation: validation.md + - Publishing: publishing.md + - Configuration: configuration.md diff --git a/docs/tools/forge/configuration.md b/docs/tools/forge/configuration.md new file mode 100644 index 0000000..26dcf7f --- /dev/null +++ b/docs/tools/forge/configuration.md @@ -0,0 +1,125 @@ +--- +title: Configuration +--- + +# Forge Configuration + +Forge manages the configuration for the CALYPR Explorer UI. This configuration defines how data is displayed, filtered, and accessed in the web interface. + +## Creating a Configuration + +You can generate a starter configuration template for your project using the `forge config` command. + +```bash +forge config --remote +``` + +This command: +1. Reads the Project ID from your specified remote (or default remote). +2. Creates a `CONFIG` directory if it doesn't exist. +3. Generates a template JSON file named `.json` inside `CONFIG/`. + +**Example:** + +```bash +forge config --remote production +``` + +If your project ID is `my-project`, this creates `CONFIG/my-project.json`. + +## Editing Configuration + +The configuration is a standard JSON file. You can edit it with any text editor. + +### Top-Level Structure + +The configuration is an array of objects, where each object represents a **Tab** in the data explorer (e.g., "Patients", "Samples", "Files"). + +```json +{ + "ExplorerConfig": [ + { + "tabTitle": "Research Subject", + "filters": { ... }, + "table": { ... }, + "guppyConfig": { ... } + } + ] +} +``` + +### Key Components + +#### `tabTitle` +The display name of the tab in the UI. + +#### `guppyConfig` +Defines the connection to the backend index (Guppy). + +- `dataType`: The index type in Guppy (e.g., "patient", "file"). +- `nodeCountTitle`: Label for the count of items (e.g., "Patients"). 
+- `accessibleFieldCheckList`: Fields to check for access control (usually `["project_id"]`). + +#### `table` +Configures the data table displayed in the tab. + +- `enabled`: Set to `true` to show the table. +- `fields`: Array of field names to include in the table data. +- `columns`: Dictionary defining how each field is rendered. + - `title`: Column header text. + - `cellRenderFunction`: Optional custom renderer (e.g., "HumanReadableString" for file sizes). + +#### `filters` +Configures the faceted search filters on the left sidebar. + +- `tabs`: Grouping of filters. + - `fields`: List of fields to show as filters. + - `fieldsConfig`: Custom labels for the filters. + +## Example Configuration + +Here is a simplified example configuration for a "Research Subject" tab: + +```json +{ + "ExplorerConfig": [ + { + "tabTitle": "Research Subject", + "guppyConfig": { + "dataType": "researchsubject", + "nodeCountTitle": "Research Subjects", + "accessibleFieldCheckList": ["project_id"] + }, + "filters": { + "tabs": [ + { + "fields": ["project_id", "gender", "race"], + "fieldsConfig": { + "project_id": { "label": "Project" }, + "gender": { "label": "Gender" } + } + } + ] + }, + "table": { + "enabled": true, + "fields": ["project_id", "submitter_id", "gender", "race"], + "columns": { + "project_id": { "title": "Project" }, + "submitter_id": { "title": "ID" }, + "gender": { "title": "Gender" }, + "race": { "title": "Race" } + } + } + } + ] +} +``` + +## Validation + +After editing your configuration, always validate it to ensure there are no syntax errors or invalid structures. + +```bash +forge validate config --path CONFIG/my-project.json +``` diff --git a/docs/tools/forge/index.md b/docs/tools/forge/index.md new file mode 100644 index 0000000..5d10a27 --- /dev/null +++ b/docs/tools/forge/index.md @@ -0,0 +1,48 @@ +--- +title: Forge +--- + +# Forge + +Forge is the CALYPR metadata management tool. It streamlines the validation, publishing, and management of data dictionaries and metadata schemas for Gen3 Data Commons. + +## Core Features + +- **Validation**: Validate your data and schemas against the Gen3 data model. +- **Publishing**: Publish schemas and metadata to a Gen3 instance. +- **Metadata Management**: Tools to query and manipulate metadata. + +## Commands + +### `validate` + +The `validate` command suite is used to ensure your data and configurations are correct before submission. + +- **`forge validate config `**: Validates a configuration file. +- **`forge validate data `**: Validates data files (e.g., JSON, TSV) against the schema. +- **`forge validate edge `**: Validates relationships (edges) between data nodes. + +### `publish` + +Manage the publishing lifecycle of your data schemas. + +- **`forge publish`**: Publish the current schema/metadata to the configured environment. +- **`forge publish status`**: Check the status of a publishing job. +- **`forge publish list`**: List available publication resources. +- **`forge publish output`**: Retrieve the output of a publication process. + +### `meta` + +Tools for handling metadata directly. + +```bash +forge meta [subcommand] +``` + +### `config` + +Manage Forge configuration settings. 
+ +```bash +forge config +``` diff --git a/docs/tools/forge/publishing.md b/docs/tools/forge/publishing.md new file mode 100644 index 0000000..060848b --- /dev/null +++ b/docs/tools/forge/publishing.md @@ -0,0 +1,68 @@ +--- +title: Publishing +--- + +# Publishing + +The `forge` tool handles the lifecycle of publishing metadata to Gen3 Commons via the **Sower** service (for async job processing). + +## Publishing Metadata + +To start a new metadata publication job: + +```bash +forge publish [flags] +``` + +This command submits a job to the Sower service. + +**Arguments:** +- ``: A GitHub Personal Access Token (PAT) is required by the backend worker to access the repository containing the metadata schema. + +**Flags:** +- `--remote`, `-r`: Target remote DRS server name (default: "default_remote"). + +**Output:** +Returns the Job UID, Name, and initial Status. + +```text +Uid: 12345-abcde Name: metadata-publish Status: PENDING +``` + +## Monitoring Jobs + +### List Jobs + +View all jobs cataloged in Sower. + +```bash +forge publish list [flags] +``` + +**Flags:** +- `--remote`, `-r`: Target remote DRS server. + +### Check Status + +Check the status of a specific job by its UID. + +```bash +forge publish status [flags] +``` + +**Flags:** +- `--remote`, `-r`: Target remote DRS server. + +### View Logs + +Retrieve the output logs of a specific job. + +```bash +forge publish output [flags] +``` + +**Flags:** +- `--remote`, `-r`: Target remote DRS server. + +**Output:** +Displays the raw logs from the backend job execution, which is useful for debugging failures. diff --git a/docs/tools/forge/validation.md b/docs/tools/forge/validation.md new file mode 100644 index 0000000..f6a4df2 --- /dev/null +++ b/docs/tools/forge/validation.md @@ -0,0 +1,81 @@ +--- +title: Validation +--- + +# Validation + +The `forge validate` command suite ensures that your metadata and configuration files adhere to the expected formats and schemas. This is a critical step before publishing data to a Gen3 Commons. + +## Validate Data + +Validates FHIR-based metadata files (NDJSON format) against a JSON schema. + +```bash +forge validate data [flags] +``` + +By default, it looks for files in a `META` directory or can be pointed to a specific file/directory. + +**Flags:** +- `--path`, `-p`: Path to metadata file(s) or directory to validate (default: `META`). + +**Behavior:** +- Checks if files are valid NDJSON. +- Validates each row against the corresponding JSON schema. +- Reports total files, rows, and errors found. + +**Output Example:** +```text +File: META/Patient.ndjson + Rows validated: 15 + Errors found: 0 +--- +Overall Totals + Files validated: 1 + Rows validated: 15 + Errors: 0 +``` + +## Validate Edge + +Checks for integrity issues in the graph data, specifically looking for "orphaned edges"—relationships that point to non-existent vertices. + +```bash +forge validate edge [flags] +``` + +**Flags:** +- `--path`, `-p`: Path to metadata files directory (default: `META`). +- `--out-dir`, `-o`: Directory to save generated vertices and edges files (JSON). + +**Behavior:** +- Generates graph elements (vertices and edges) from the input NDJSON files. +- Verifies that every edge points to a valid destination vertex. +- Can optionally export the vertices and edges to disk. 
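+
+For example, to validate the edges generated from `META/` and also write the vertices and edges to a local directory for inspection, you might run something like the following (the output directory name is illustrative):
+
+```bash
+# Validate graph edges and export the generated vertices/edges as JSON (illustrative out-dir)
+forge validate edge --path META --out-dir graph-out
+```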
+ +**Output Example:** +```text +File: META/Patient.ndjson + Rows processed: 15 + Vertices generated: 15 + Edges generated: 0 +--- +Orphaned Edges: 0 +Overall Totals: + Files processed: 1 + Rows processed: 15 + Vertices generated: 15 + Edges generated: 0 + Orphaned edges: 0 +``` + +## Validate Config + +Validates the explorer configuration file structure. + +```bash +forge validate config [flags] +``` + +**Flags:** +- `--path`, `-p`: Path to config file to validate (default: `CONFIG`). diff --git a/docs/tools/funnel/_releases.md b/docs/tools/funnel/_releases.md new file mode 100644 index 0000000..3c303ff --- /dev/null +++ b/docs/tools/funnel/_releases.md @@ -0,0 +1,7 @@ +| Asset | Download | +| --- | --- | +| funnel-darwin-amd64-v0.11.7.tar.gz | [Download](https://github.com/ohsu-comp-bio/funnel/releases/download/v0.11.7/funnel-darwin-amd64-v0.11.7.tar.gz) | +| funnel-darwin-arm64-v0.11.7.tar.gz | [Download](https://github.com/ohsu-comp-bio/funnel/releases/download/v0.11.7/funnel-darwin-arm64-v0.11.7.tar.gz) | +| funnel-linux-amd64-v0.11.7.tar.gz | [Download](https://github.com/ohsu-comp-bio/funnel/releases/download/v0.11.7/funnel-linux-amd64-v0.11.7.tar.gz) | +| funnel-linux-arm64-v0.11.7.tar.gz | [Download](https://github.com/ohsu-comp-bio/funnel/releases/download/v0.11.7/funnel-linux-arm64-v0.11.7.tar.gz) | +| funnel-v0.11.7-checksums.txt | [Download](https://github.com/ohsu-comp-bio/funnel/releases/download/v0.11.7/funnel-v0.11.7-checksums.txt) | diff --git a/docs/tools/funnel/docs.md b/docs/tools/funnel/docs.md new file mode 100644 index 0000000..fa2a781 --- /dev/null +++ b/docs/tools/funnel/docs.md @@ -0,0 +1,82 @@ +--- +title: Overview +menu: + main: + identifier: docs + weight: -1000 +--- + +# Overview + +Funnel makes distributed, batch processing easier by providing a simple task API and a set of +components which can easily adapted to a vareity of platforms. + +### Task + +A task defines a unit of work: metadata, input files to download, a sequence of Docker containers + commands to run, +output files to upload, state, and logs. The API allows you to create, get, list, and cancel tasks. + +Tasks are accessed via the `funnel task` command. There's an HTTP client in the [client package][clientpkg], +and a set of utilities and a gRPC client in the [proto/tes package][tespkg]. + +There's a lot more you can do with the task API. See the [tasks docs](/docs/tasks/) for more. + +### Server + +The server serves the task API, web dashboard, and optionally runs a task scheduler. +It serves both HTTP/JSON and gRPC/Protobuf. + +The server is accessible via the `funnel server` command and the [server package][serverpkg]. + +### Storage + +Storage provides access to file systems such as S3, Google Storage, and local filesystems. +Tasks define locations where files should be downloaded from and uploaded to. Workers handle +the downloading/uploading. + +See the [storage docs](/docs/storage/) for more information on configuring storage backends. +The storage clients are available in the [storage package][storagepkg]. + +### Worker + +A worker is reponsible for executing a task. There is one worker per task. 
A worker: + +- downloads the inputs +- runs the sequence of executors (usually via Docker) +- uploads the outputs + +Along the way, the worker writes logs to event streams and databases: + +- start/end time +- state changes (initializing, running, error, etc) +- executor start/end times +- executor exit codes +- executor stdout/err logs +- a list of output files uploaded, with sizes +- system logs, such as host name, docker command, system error messages, etc. + +The worker is accessible via the `funnel worker` command and the [worker package][workerpkg]. + +### Node Scheduler + +A node is a service that stays online and manages a pool of task workers. A Funnel cluster +runs a node on each VM. Nodes communicate with a Funnel scheduler, which assigns tasks +to nodes based on available resources. Nodes handle starting workers when for each assigned +task. + +Nodes aren't always required. In some cases it often makes sense to rely on an existing, +external system for scheduling tasks and managing cluster resources, such as AWS Batch +or HPC systems like HTCondor, Slurm, Grid Engine, etc. Funnel provides integration with +these services that doesn't include nodes or scheduling by Funnel. + +See [Deploying a cluster](/docs/compute/deployment/) for more information about running a cluster of nodes. + +The node is accessible via the `funnel node` command and the [scheduler package][schedpkg]. + +[tes]: https://github.com/ga4gh/task-execution-schemas +[serverpkg]: https://github.com/ohsu-comp-bio/funnel/tree/master/server +[workerpkg]: https://github.com/ohsu-comp-bio/funnel/tree/master/worker +[schedpkg]: https://github.com/ohsu-comp-bio/funnel/tree/master/compute/scheduler +[clientpkg]: https://github.com/ohsu-comp-bio/funnel/tree/master/client +[tespkg]: https://github.com/ohsu-comp-bio/funnel/tree/master/proto/tes +[storagepkg]: https://github.com/ohsu-comp-bio/funnel/tree/master/storage diff --git a/docs/tools/funnel/docs/compute/aws-batch.md b/docs/tools/funnel/docs/compute/aws-batch.md new file mode 100644 index 0000000..bebc256 --- /dev/null +++ b/docs/tools/funnel/docs/compute/aws-batch.md @@ -0,0 +1,100 @@ +--- +title: AWS Batch +menu: + main: + parent: Compute + weight: 20 +--- + +# AWS Batch + +This guide covers deploying a Funnel server that leverages [DynamoDB][0] for storage +and [AWS Batch][1] for task execution. + +## Setup + +Get started by creating a compute environment, job queue and job definition using either +the Funnel CLI or the AWS Batch web console. To manage the permissions of instanced +AWS Batch jobs create a new IAM role. For the Funnel configuration outlined +in this document, this role will need to provide read and write access to both S3 and DynamoDB. + +_Note_: We recommend creating the Job Definition with Funnel by running: `funnel aws batch create-job-definition`. +Funnel expects the JobDefinition to start a Funnel worker process with a specific configuration. +Only advanced users should consider making any substantial changes to this Job Definition. + +AWS Batch tasks, by default, launch the ECS Optimized AMI which includes +an 8GB volume for the operating system and a 22GB volume for Docker image and metadata +storage. The default Docker configuration allocates up to 10GB of this storage to +each container instance. [Read more about the default AMI][8]. Due to these limitations, we +recommend [creating a custom AMI][7]. 
Because AWS Batch has the same requirements for your +AMI as Amazon ECS, use the default Amazon ECS-optimized Amazon Linux AMI as a base and change it +to better suit your tasks. + +### Steps +* [Create a Compute Environment][3] +* (_Optional_) [Create a custom AMI][7] +* [Create a Job Queue][4] +* [Create an EC2ContainerTaskRole with policies for managing access to S3 and DynamoDB][5] +* [Create a Job Definition][6] + +For more information check out AWS Batch's [getting started guide][2]. + +### Quickstart + +``` +$ funnel aws batch create-all-resources --region us-west-2 + +``` + +This command will create a compute environment, job queue, IAM role and job definition. + +## Configuring the Funnel Server + +Below is an example configuration. Note that the `Key` +and `Secret` fields are left blank in the configuration of the components. This is because +Funnel will, by default, try to automatically load credentials from the environment. +Alternatively, you may explicitly set the credentials in the config. + +```YAML +Database: "dynamodb" +Compute: "aws-batch" +EventWriters: + - "log" + +Dynamodb: + TableBasename: "funnel" + Region: "us-west-2" + Key: "" + Secret: "" + +Batch: + JobDefinition: "funnel-job-def" + JobQueue: "funnel-job-queue" + Region: "us-west-2" + Key: "" + Secret: "" + +AmazonS3: + Key: "" + Secret: "" +``` + +### Start the server + +```sh +funnel server run --config /path/to/config.yaml +``` + +### Known issues + +The `Task.Resources.DiskGb` field does not have any effect. See [issue 317](https://github.com/ohsu-comp-bio/funnel/issues/317). + +[0]: http://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Introduction.html +[1]: http://docs.aws.amazon.com/batch/latest/userguide/what-is-batch.html +[2]: http://docs.aws.amazon.com/batch/latest/userguide/Batch_GetStarted.html +[3]: https://us-west-2.console.aws.amazon.com/batch/home?region=us-west-2#/compute-environments/new +[4]: https://us-west-2.console.aws.amazon.com/batch/home?region=us-west-2#/queues/new +[5]: https://console.aws.amazon.com/iam/home?region=us-west-2#/roles$new?step=permissions&selectedService=EC2ContainerService&selectedUseCase=EC2ContainerTaskRole +[6]: https://us-west-2.console.aws.amazon.com/batch/home?region=us-west-2#/job-definitions/new +[7]: http://docs.aws.amazon.com/batch/latest/userguide/create-batch-ami.html +[8]: http://docs.aws.amazon.com/AmazonECS/latest/developerguide/ecs-optimized_AMI.html diff --git a/docs/tools/funnel/docs/compute/deployment.md b/docs/tools/funnel/docs/compute/deployment.md new file mode 100644 index 0000000..2ea266f --- /dev/null +++ b/docs/tools/funnel/docs/compute/deployment.md @@ -0,0 +1,79 @@ +--- +title: Deploying a cluster +menu: + main: + parent: Compute + weight: -50 +--- + +# Deploying a cluster + +This guide describes the basics of starting a cluster of Funnel nodes. +This guide is a work in progress. + +A node is a service +which runs on each machine in a cluster. The node connects to the Funnel server and reports +available resources. The Funnel scheduler process assigns tasks to nodes. When a task is +assigned, a node will start a worker process. There is one worker process per task. + +Nodes aren't always required. In some cases it makes sense to rely on an existing, +external system for scheduling tasks and managing cluster resources, such as AWS Batch, +HTCondor, Slurm, Grid Engine, etc. Funnel provides integration with +these services without using nodes or the scheduler. + +### Usage + +Nodes are available via the `funnel node` command. 
To start a node, run +```sh +funnel node run --config node.config.yml +``` + +To activate the Funnel scheduler, use the `manual` backend in the config. + +The available scheduler and node config: +```yaml +# Activate the Funnel scheduler. +Compute: manual + +Scheduler: + # How often to run a scheduler iteration. + ScheduleRate: 1s + + # How many tasks to schedule in one iteration. + ScheduleChunk: 10 + + # How long to wait between updates before marking a node dead. + NodePingTimeout: 1m + + # How long to wait for a node to start, before marking the node dead. + NodeInitTimeout: 5m + + +Node: + # If empty, a node ID will be automatically generated using the hostname. + ID: "" + + # If the node has been idle for longer than the timeout, it will shut down. + # -1 means there is no timeout. 0 means timeout immediately after the first task. + Timeout: -1s + + # A Node will automatically try to detect what resources are available to it. + # Defining Resources in the Node configuration overrides this behavior. + Resources: + # CPUs available. + # Cpus: 0 + # RAM available, in GB. + # RamGb: 0.0 + # Disk space available, in GB. + # DiskGb: 0.0 + + # For low-level tuning. + # How often to sync with the Funnel server. + UpdateRate: 5s + +Logger: + # Logging levels: debug, info, error + Level: info + # Write logs to this path. If empty, logs are written to stderr. + OutputFile: "" +``` diff --git a/docs/tools/funnel/docs/compute/grid-engine.md b/docs/tools/funnel/docs/compute/grid-engine.md new file mode 100644 index 0000000..d5b5921 --- /dev/null +++ b/docs/tools/funnel/docs/compute/grid-engine.md @@ -0,0 +1,57 @@ +--- +title: Grid Engine +--- +# Grid Engine + +Funnel can be configured to submit workers to [Grid Engine](https://gridscheduler.sourceforge.net/) by making calls +to `qsub`. + +The Funnel server needs to run on a submission node. +Configure Funnel to use Grid Engine by including the following config: + +It is recommended to update the submit file template so that the +`funnel worker run` command takes a config file as an argument: + +``` +{% raw %} +funnel worker run --config /opt/funnel_config.yml --taskID {{.TaskId}} +{% endraw %} +``` + +```YAML +{% raw %} +Compute: gridengine + +GridEngine: + Template: | + #!/bin/bash + #$ -N {{.TaskId}} + #$ -o {{.WorkDir}}/funnel-stdout + #$ -e {{.WorkDir}}/funnel-stderr + {{if ne .Cpus 0 -}} + {{printf "#$ -pe mpi %d" .Cpus}} + {{- end}} + {{if ne .RamGb 0.0 -}} + {{printf "#$ -l h_vmem=%.0fG" .RamGb}} + {{- end}} + {{if ne .DiskGb 0.0 -}} + {{printf "#$ -l h_fsize=%.0fG" .DiskGb}} + {{- end}} + funnel worker run --taskID {{.TaskId}} +{% endraw %} +``` + +The following variables are available for use in the template: + +| Variable | Description | +|:------------|:-------------| +|TaskId | funnel task id | +|WorkDir | funnel working directory | +|Cpus | requested cpu cores | +|RamGb | requested ram | +|DiskGb | requested free disk space | +|Zone | requested zone (could be used for queue name) | + +See https://golang.org/pkg/text/template for information on creating templates. 
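+
+As a concrete illustration of how the template above is filled in, a task requesting 4 CPUs and 8 GB of RAM would produce a submit script roughly like the following (the task ID and working directory are made up for illustration, DiskGb is assumed to be unset, and if you follow the `--config` recommendation above that flag appears on the final `funnel worker run` line as well):
+
+```sh
+#!/bin/bash
+#$ -N task-abc123
+#$ -o /opt/funnel-work-dir/task-abc123/funnel-stdout
+#$ -e /opt/funnel-work-dir/task-abc123/funnel-stderr
+#$ -pe mpi 4
+#$ -l h_vmem=8G
+funnel worker run --taskID task-abc123
+```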
+ +[ge]: http://gridscheduler.sourceforge.net/documentation.html diff --git a/docs/tools/funnel/docs/compute/htcondor.md b/docs/tools/funnel/docs/compute/htcondor.md new file mode 100644 index 0000000..a6c8ebd --- /dev/null +++ b/docs/tools/funnel/docs/compute/htcondor.md @@ -0,0 +1,61 @@ +--- +title: HTCondor +menu: + main: + parent: Compute + weight: 20 +--- +# HTCondor + +Funnel can be configured to submit workers to [HTCondor][htcondor] by making +calls to `condor_submit`. + +The Funnel server needs to run on a submission node. +Configure Funnel to use HTCondor by including the following config: + +It is recommended to update the submit file template so that the +`funnel worker run` command takes a config file as an argument {% raw %} +(e.g. `funnel worker run --config /opt/funnel_config.yml --taskID {{.TaskId}}`){% endraw %} + +```YAML +{% raw %} +Compute: htcondor + +HTCondor: + Template: | + universe = vanilla + getenv = True + executable = funnel + arguments = worker run --taskID {{.TaskId}} + log = {{.WorkDir}}/condor-event-log + error = {{.WorkDir}}/funnel-stderr + output = {{.WorkDir}}/funnel-stdout + should_transfer_files = YES + when_to_transfer_output = ON_EXIT_OR_EVICT + {{if ne .Cpus 0 -}} + {{printf "request_cpus = %d" .Cpus}} + {{- end}} + {{if ne .RamGb 0.0 -}} + {{printf "request_memory = %.0f GB" .RamGb}} + {{- end}} + {{if ne .DiskGb 0.0 -}} + {{printf "request_disk = %.0f GB" .DiskGb}} + {{- end}} + + queue +{% endraw %} +``` +The following variables are available for use in the template: + +| Variable | Description | +|:------------|:-------------| +|TaskId | funnel task id | +|WorkDir | funnel working directory | +|Cpus | requested cpu cores | +|RamGb | requested ram | +|DiskGb | requested free disk space | +|Zone | requested zone (could be used for queue name) | + +See https://golang.org/pkg/text/template for information on creating templates. + +[htcondor]: https://research.cs.wisc.edu/htcondor/ diff --git a/docs/tools/funnel/docs/compute/kubernetes.md b/docs/tools/funnel/docs/compute/kubernetes.md new file mode 100644 index 0000000..4fdf748 --- /dev/null +++ b/docs/tools/funnel/docs/compute/kubernetes.md @@ -0,0 +1,121 @@ +--- +title: Kubernetes +menu: + main: + parent: Compute + weight: 20 +--- + +> Funnel on Kubernetes is in active development and may involve frequent updates + +# Quick Start + +## 1. Deploying with Helm + +```sh +helm repo add ohsu https://ohsu-comp-bio.github.io/helm-charts +helm repo update +helm upgrade --install ohsu funnel +``` + +## Alternative: Deploying with `kubectl` ⚙️" + +### 1. Create a Service: + +Deploy it: + +```sh +kubectl apply -f funnel-service.yml +``` + +### 2. Create Funnel config files + +> *[funnel-server.yaml](https://github.com/ohsu-comp-bio/funnel/blob/develop/deployments/kubernetes/funnel-server.yaml)* + +> *[funnel-worker.yaml](https://github.com/ohsu-comp-bio/funnel/blob/develop/deployments/kubernetes/funnel-worker.yaml)* + +Get the clusterIP: + +```sh +{% raw %} +export HOSTNAME=$(kubectl get services funnel --output=jsonpath='{.spec.clusterIP}') + +sed -i "s|\${HOSTNAME}|${HOSTNAME}|g" funnel-worker.yaml +{% endraw %} +``` + +### 3. Create a ConfigMap + +```sh +kubectl create configmap funnel-config --from-file=funnel-server.yaml --from-file=funnel-worker.yaml +``` + +### 4. 
Create a Service Account for Funnel + +Define a Role and RoleBinding: + +> *[role.yml](https://github.com/ohsu-comp-bio/funnel/blob/develop/deployments/kubernetes/role.yml)* + +> *[role_binding.yml](https://github.com/ohsu-comp-bio/funnel/blob/develop/deployments/kubernetes/role_binding.yml)* + +```sh +kubectl create serviceaccount funnel-sa --namespace default +kubectl apply -f role.yml +kubectl apply -f role_binding.yml +``` + +### 5. Create a Persistent Volume Claim + +> *[funnel-storage-pvc.yml](https://github.com/ohsu-comp-bio/funnel/blob/develop/deployments/kubernetes/funnel-storage-pvc.yml)* + +```sh +kubectl apply -f funnel-storage-pvc.yml +``` + +### 6. Create a Deployment + +> *[funnel-deployment.yml](https://github.com/ohsu-comp-bio/funnel/blob/develop/deployments/kubernetes/funnel-deployment.yml)* + +```sh +kubectl apply -f funnel-deployment.yml +``` + +{% raw %}{{< /details >}}{% endraw %} + +# 2. Proxy the Service for local testing + +```sh +kubectl port-forward service/funnel 8000:8000 +``` + +Now the funnel server can be accessed as if it were running locally. This can be verified by listing all tasks, which will return an empty JSON list: + +```sh +funnel task list +# {} +``` + +A task can then be submitted following the [standard workflow](../../tasks): + +```sh +funnel examples hello-world > hello-world.json + +funnel task create hello-world.json +# +``` + +# Storage Architecture + + + + + +# Additional Resources 📚 + +- [Helm Repo](https://ohsu-comp-bio.github.io/helm-charts) + +- [Helm Repo Source](https://github.com/ohsu-comp-bio/helm-charts) + +- [Helm Charts](https://github.com/ohsu-comp-bio/funnel/tree/develop/deployments/kubernetes/helm) + +- [The Chart Best Practices Guide](https://helm.sh/docs/chart_best_practices/) diff --git a/docs/tools/funnel/docs/compute/pbs-torque.md b/docs/tools/funnel/docs/compute/pbs-torque.md new file mode 100644 index 0000000..e548b3d --- /dev/null +++ b/docs/tools/funnel/docs/compute/pbs-torque.md @@ -0,0 +1,57 @@ +--- +title: PBS/Torque +render_macros: true +menu: + main: + parent: Compute + weight: 20 +--- +# PBS/Torque + +Funnel can be configured to submit workers to [PBS/Torque][pbs] by making calls +to `qsub`. + +The Funnel server needs to run on a submission node. +Configure Funnel to use PBS by including the following config: + +It is recommended to update the submit file template so that the +`funnel worker run` command takes a config file as an argument +(e.g. `funnel worker run --config /opt/funnel_config.yml --taskID {% raw %}{{.TaskId}}{% endraw %}`) + +{% raw %} +```YAML +Compute: pbs + +PBS: + Template: | + #!/bin/bash + #PBS -N {{.TaskId}} + #PBS -o {{.WorkDir}}/funnel-stdout + #PBS -e {{.WorkDir}}/funnel-stderr + {{if ne .Cpus 0 -}} + {{printf "#PBS -l nodes=1:ppn=%d" .Cpus}} + {{- end}} + {{if ne .RamGb 0.0 -}} + {{printf "#PBS -l mem=%.0fgb" .RamGb}} + {{- end}} + {{if ne .DiskGb 0.0 -}} + {{printf "#PBS -l file=%.0fgb" .DiskGb}} + {{- end}} + + funnel worker run --taskID {{.TaskId}} +``` +{% endraw %} +The following variables are available for use in the template: + +| Variable | Description | +|:------------|:-------------| +|TaskId | funnel task id | +|WorkDir | funnel working directory | +|Cpus | requested cpu cores | +|RamGb | requested ram | +|DiskGb | requested free disk space | +|Zone | requested zone (could be used for queue name) | + +See https://golang.org/pkg/text/template for information on creating templates. 
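As an example of customizing the template, the `Zone` variable could be mapped onto a PBS queue request. This is only a sketch — it assumes tasks supply a single zone name that matches a queue on your cluster:

{% raw %}
```YAML
Compute: pbs

PBS:
  Template: |
    #!/bin/bash
    #PBS -N {{.TaskId}}
    #PBS -o {{.WorkDir}}/funnel-stdout
    #PBS -e {{.WorkDir}}/funnel-stderr
    {{if ne .Zone "" -}}
    {{printf "#PBS -q %s" .Zone}}
    {{- end}}

    funnel worker run --taskID {{.TaskId}}
```
{% endraw %}

The CPU, memory, and disk directives from the default template above can be kept alongside the queue line; they are omitted here only to keep the sketch short.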
+ +[pbs]: http://www.adaptivecomputing.com/products/open-source/torque/ diff --git a/docs/tools/funnel/docs/compute/slurm.md b/docs/tools/funnel/docs/compute/slurm.md new file mode 100644 index 0000000..98f1697 --- /dev/null +++ b/docs/tools/funnel/docs/compute/slurm.md @@ -0,0 +1,57 @@ +--- +title: Slurm +menu: + main: + parent: Compute + weight: 20 +--- +# Slurm + +Funnel can be configured to submit workers to [Slurm][slurm] by making calls +to `sbatch`. + +The Funnel server needs to run on a submission node. +Configure Funnel to use Slurm by including the following config: + +It is recommended to update the submit file template so that the +`funnel worker run` command takes a config file as an argument +(e.g. `funnel worker run --config /opt/funnel_config.yml --taskID {% raw %}{{.TaskId}}{% endraw %}`) + +{% raw %} +```YAML +Compute: slurm + +Slurm: + Template: | + #!/bin/bash + #SBATCH --job-name {{.TaskId}} + #SBATCH --ntasks 1 + #SBATCH --error {{.WorkDir}}/funnel-stderr + #SBATCH --output {{.WorkDir}}/funnel-stdout + {{if ne .Cpus 0 -}} + {{printf "#SBATCH --cpus-per-task %d" .Cpus}} + {{- end}} + {{if ne .RamGb 0.0 -}} + {{printf "#SBATCH --mem %.0fGB" .RamGb}} + {{- end}} + {{if ne .DiskGb 0.0 -}} + {{printf "#SBATCH --tmp %.0fGB" .DiskGb}} + {{- end}} + + funnel worker run --taskID {{.TaskId}} +``` +{% endraw %} +The following variables are available for use in the template: + +| Variable | Description | +|:------------|:-------------| +|TaskId | funnel task id | +|WorkDir | funnel working directory | +|Cpus | requested cpu cores | +|RamGb | requested ram | +|DiskGb | requested free disk space | +|Zone | requested zone (could be used for queue name) | + +See https://golang.org/pkg/text/template for information on creating templates. + +[slurm]: https://slurm.schedmd.com/ diff --git a/docs/tools/funnel/docs/databases.md b/docs/tools/funnel/docs/databases.md new file mode 100644 index 0000000..5eeb638 --- /dev/null +++ b/docs/tools/funnel/docs/databases.md @@ -0,0 +1,8 @@ +--- +title: Databases +menu: + main: + weight: 5 +--- + +# Databases diff --git a/docs/tools/funnel/docs/databases/boltdb.md b/docs/tools/funnel/docs/databases/boltdb.md new file mode 100644 index 0000000..ea5885e --- /dev/null +++ b/docs/tools/funnel/docs/databases/boltdb.md @@ -0,0 +1,24 @@ +--- +title: Embedded +menu: + main: + parent: Databases + weight: -10 +--- + +# Embedded + +By default, Funnel uses an embedded database named [BoltDB][bolt] to store task +and scheduler data. This is great for development and a simple server without +external dependencies, but it doesn't scale well to larger clusters. + +Available config: +```yaml +Database: boltdb + +BoltDB: + # Path to database file + Path: ./funnel-work-dir/funnel.db +``` + +[bolt]: https://github.com/boltdb/bolt diff --git a/docs/tools/funnel/docs/databases/datastore.md b/docs/tools/funnel/docs/databases/datastore.md new file mode 100644 index 0000000..ea31d8c --- /dev/null +++ b/docs/tools/funnel/docs/databases/datastore.md @@ -0,0 +1,94 @@ +--- +title: Datastore +menu: + main: + parent: Databases +--- + +# Google Cloud Datastore + +Funnel supports storing tasks (but not scheduler data) in Google Cloud Datastore. + +This implementation currently doesn't work with Appengine, since Appengine places +special requirements on the context of requests and requires a separate library. + +Two entity types are used, "Task" and "TaskPart" (for larger pieces of task content, +such as stdout/err logs). 
+ +Funnel will, by default, try to automatically load credentials from the +environment. Alternatively, you may explicitly set the credentials in the config. +You can read more about providing the credentials +[here](https://cloud.google.com/docs/authentication/application-default-credentials). + +Config: +```yaml +Database: datastore + +Datastore: + Project: "" + # Path to account credentials file. + # Optional. If possible, credentials will be automatically discovered + # from the environment. + CredentialsFile: "" +``` + +Please also import some [composite +indexes](https://cloud.google.com/datastore/docs/concepts/indexes?hl=en) +to support the task-list queries. +This is typically done through command-line by referencing an **index.yaml** +file (do not change the filename) with the following content: + +```shell +gcloud datastore indexes create path/to/index.yaml --database='funnel' +``` + +```yaml +indexes: + +- kind: Task + properties: + - name: Owner + - name: State + - name: TagStrings + - name: CreationTime + direction: desc + +- kind: Task + properties: + - name: Owner + - name: State + - name: CreationTime + direction: desc + +- kind: Task + properties: + - name: Owner + - name: TagStrings + - name: CreationTime + direction: desc + +- kind: Task + properties: + - name: Owner + - name: CreationTime + direction: desc + +- kind: Task + properties: + - name: State + - name: TagStrings + - name: CreationTime + direction: desc + +- kind: Task + properties: + - name: State + - name: CreationTime + direction: desc + +- kind: Task + properties: + - name: TagStrings + - name: CreationTime + direction: desc +``` \ No newline at end of file diff --git a/docs/tools/funnel/docs/databases/dynamodb.md b/docs/tools/funnel/docs/databases/dynamodb.md new file mode 100644 index 0000000..3e536c2 --- /dev/null +++ b/docs/tools/funnel/docs/databases/dynamodb.md @@ -0,0 +1,30 @@ +--- +title: DynamoDB +menu: + main: + parent: Databases +--- + +# DynamoDB + +Funnel supports storing task data in DynamoDB. Storing scheduler data is not supported currently, so using the node scheduler with DynamoDB won't work. Using AWS Batch for compute scheduling may be a better option. +Funnel will, by default, try to automatically load credentials from the environment. Alternatively, you may explicitly set the credentials in the config. + +Available Config: +```yaml +Database: dynamodb + +DynamoDB: + # Basename to use for dynamodb tables + TableBasename: "funnel" + # AWS region + Region: "us-west-2" + # AWS Access key ID + Key: "" + # AWS Secret Access Key + Secret: "" +``` + +### Known issues + +Dynamo does not store scheduler data. See [issue 340](https://github.com/ohsu-comp-bio/funnel/issues/340). diff --git a/docs/tools/funnel/docs/databases/elasticsearch.md b/docs/tools/funnel/docs/databases/elasticsearch.md new file mode 100644 index 0000000..e397348 --- /dev/null +++ b/docs/tools/funnel/docs/databases/elasticsearch.md @@ -0,0 +1,30 @@ +--- +title: Elasticsearch +menu: + main: + parent: Databases +--- + +# Elasticsearch + +Funnel supports storing tasks and scheduler data in Elasticsearch (v8). + +Config: +```yaml +Database: elastic + +Elastic: + # Prefix to use for indexes + IndexPrefix: "funnel" + URL: http://localhost:9200 + # Optional. Username for HTTP Basic Authentication. + Username: + # Optional. Password for HTTP Basic Authentication. + Password: + # Optional. Endpoint for the Elastic Service (https://elastic.co/cloud). + CloudID: + # Optional. 
Base64-encoded token for authorization; if set, overrides username/password and service token. + APIKey: + # Optional. Service token for authorization; if set, overrides username/password. + ServiceToken: +``` diff --git a/docs/tools/funnel/docs/databases/mongodb.md b/docs/tools/funnel/docs/databases/mongodb.md new file mode 100644 index 0000000..4a6e8ab --- /dev/null +++ b/docs/tools/funnel/docs/databases/mongodb.md @@ -0,0 +1,24 @@ +--- +title: MongoDB +menu: + main: + parent: Databases +--- + +# MongoDB + +Funnel supports storing tasks and scheduler data in MongoDB. + +Config: +```yaml +Database: mongodb + +MongoDB: + # Addresses for the seed servers. + Addrs: + - "localhost" + # Database name used within MongoDB to store funnel data. + Database: "funnel" + Username: "" + Password: "" +``` diff --git a/docs/tools/funnel/docs/development.md b/docs/tools/funnel/docs/development.md new file mode 100644 index 0000000..1412f0d --- /dev/null +++ b/docs/tools/funnel/docs/development.md @@ -0,0 +1,8 @@ +--- +title: Development +menu: + main: + weight: 30 +--- + +# Development diff --git a/docs/tools/funnel/docs/development/developers.md b/docs/tools/funnel/docs/development/developers.md new file mode 100644 index 0000000..2d19f0a --- /dev/null +++ b/docs/tools/funnel/docs/development/developers.md @@ -0,0 +1,97 @@ +--- +title: Funnel Developers + +menu: + main: + parent: Development + weight: 30 +--- + +# Developers + +This page contains a rough collection of notes for people wanting to build Funnel from source and/or edit the code. + +### Building the Funnel source + +1. Install [Go 1.21+][go]. Check the version with `go version`. +2. Ensure GOPATH is set. See [the docs][gopath] for help. Also, you probably want to add `$GOPATH/bin` to your `PATH`. +3. Clone funnel and build + + ```shell + $ git clone https://github.com/ohsu-comp-bio/funnel.git + $ cd funnel + $ make + ``` + +4. Funnel is now downloaded and installed. Try `funnel version`. +5. You can edit the code and run `make install` to recompile. + +### Developer Tools + +A Funnel development environment includes: + +- [Go 1.21+][go] for the majority of the code. +- [Task Execution Schemas][tes] for task APIs. +- [Protobuf][protobuf] + [gRPC][grpc] for RPC communication. +- [gRPC Gateway][gateway] for HTTP communication. +- [Angular][angular] and [SASS][sass] for the web dashboard. +- [GNU Make][make] for development tasks. +- [Docker][docker] for executing task containers (tested with v1.12, v1.13). +- [dep][dep] for Go dependency vendoring. +- [Make][make] for development/build commands. +- [NodeJS][node] and [NPM][npm] for web dashboard development. + +### Makefile + +Most development tasks are run through `make` commands, including build, release, testing, website docs, lint, tidy, webdash dev, and more. See the [Makefile](https://github.com/ohsu-comp-bio/funnel/blob/master/Makefile) for an up-to-date list of commands. + +### Go Tests + +Run all tests: `make test` +Run the worker tests: `go test ./worker/...` +Run the worker tests with "Cancel" in the name: `go test ./worker -run Cancel` + +You get the idea. See the `go test` docs for more. + +### Mocking + +The [testify][testify] and [mockery][mockery] tools are used to generate and use +mock interfaces in test code, for example, to mock the Google Cloud APIs. 
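The general shape of that pattern looks roughly like the following. This is a minimal sketch using illustrative names (`FakeStore`, `Get`), not Funnel's actual interfaces or generated mocks:

```go
package example

import (
	"context"
	"testing"

	"github.com/stretchr/testify/mock"
	"github.com/stretchr/testify/require"
)

// FakeStore is a hand-written stand-in for what mockery generates:
// it embeds testify's mock.Mock and records each call.
type FakeStore struct {
	mock.Mock
}

// Get records the call and returns whatever the test configured with On/Return.
func (f *FakeStore) Get(ctx context.Context, url string) (string, error) {
	args := f.Called(ctx, url)
	return args.String(0), args.Error(1)
}

func TestUsesStore(t *testing.T) {
	s := new(FakeStore)
	s.On("Get", mock.Anything, "gs://bucket/obj").Return("contents", nil)

	// In a real test the code under test would receive the mock through an
	// interface; calling it directly here just shows the round trip.
	out, err := s.Get(context.Background(), "gs://bucket/obj")
	require.NoError(t, err)
	require.Equal(t, "contents", out)

	s.AssertExpectations(t)
}
```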
+ +[go]: https://golang.org +[angular]: https://angularjs.org/ +[protobuf]: https://github.com/google/protobuf +[grpc]: http://www.grpc.io/ +[sass]: http://sass-lang.com/ +[make]: https://www.gnu.org/software/make/ +[docker]: https://docker.io +[python]: https://www.python.org/ +[dep]: https://golang.github.io/dep/ +[node]: https://nodejs.org +[npm]: https://www.npmjs.com/ +[gateway]: https://github.com/grpc-ecosystem/grpc-gateway +[tes]: https://github.com/ga4gh/task-execution-schemas +[testify]: https://github.com/stretchr/testify +[mockery]: https://github.com/vektra/mockery +[gopath]: https://golang.org/doc/code.html#GOPATH + +### Making a release + +- Update Makefile, edit `FUNNEL_VERSION` and `LAST_PR_NUMBER` + - `LAST_PR_NUMBER` can be found by looking at the previous release notes + from the previous release. +- Run `make website`, which updates the download links and other content. + - Check the website locally by running `make website-dev` +- Commit these changes. + - Because goreleaser requires a clean working tree in git + - This is a special case where it's easiest to commit to master. +- Create a git tag: `git tag X.Y.Z` +- Run `make release` + - This will build cross-platform binaries, build release notes, + and draft an unpublished GitHub release. + - Check the built artifacts by downloading the tarballs from the GitHub draft release + and running `funnel version`. +- `git push origin master` to push your website and release changes. +- A tagged docker image for the release will be built automatically on [dockerhub](https://hub.docker.com/repository/docker/quay.io/ohsu-comp-bio/funnel). +- Publish the draft release on GitHub. +- Copy `build/release/funnel.rb` to the `ohsu-comp-bio/homebrew-formula/Formula/funnel.rb` Homebrew formula repo, and push those changes to master. diff --git a/docs/tools/funnel/docs/events.md b/docs/tools/funnel/docs/events.md new file mode 100644 index 0000000..87941ef --- /dev/null +++ b/docs/tools/funnel/docs/events.md @@ -0,0 +1,7 @@ +--- +title: Events +menu: + main: + weight: 5 +--- +# Events diff --git a/docs/tools/funnel/docs/events/kafka.md b/docs/tools/funnel/docs/events/kafka.md new file mode 100644 index 0000000..242ce93 --- /dev/null +++ b/docs/tools/funnel/docs/events/kafka.md @@ -0,0 +1,22 @@ +--- +title: Kafka +menu: + main: + parent: Events +--- + +# Kafka + +Funnel supports writing task events to a Kafka topic. To use this, add an event +writer to the config: + +```yaml +EventWriters: + - kafka + - log + +Kafka: + Servers: + - localhost:9092 + Topic: funnel-events +``` diff --git a/docs/tools/funnel/docs/integrations/nextflow.md b/docs/tools/funnel/docs/integrations/nextflow.md new file mode 100644 index 0000000..3090a94 --- /dev/null +++ b/docs/tools/funnel/docs/integrations/nextflow.md @@ -0,0 +1,100 @@ +--- +title: Nextflow +menu: + main: + parent: Integrations +--- + +> ⚠️ Nextflow support is currently in development and requires a few additional steps to run which are included below. + +# Nextflow + +[Nextflow](https://nextflow.io/) is a workflow engine with a [rich ecosystem]() of pipelines centered around biological analysis. + +> Nextflow enables scalable and reproducible scientific workflows using software containers. It allows the adaptation of pipelines written in the most common scripting languages. + +> Its fluent DSL simplifies the implementation and the deployment of complex parallel and reactive workflows on clouds and clusters. 
+ +Since Nextflow [includes support](https://www.nextflow.io/docs/latest/executor.html#ga4gh-tes) for the TES API, it can be used in conjunction with Funnel to run tasks or to interact with a common TES endpoint. + +## Getting Started + +To set up Nextflow to use Funnel as the TES executor, run the following steps: + +### 1. Install Nextflow + +*Adapted from the [Nextflow Documentation](https://nextflow.io/docs/latest/install.html)* + +#### a. Install Nextflow: + +```sh +curl -s https://get.nextflow.io | bash +``` + +This will create the nextflow executable in the current directory. + +#### b. Make Nextflow executable: + +```sh +chmod +x nextflow +``` + +#### c. Move Nextflow into an executable path: + +```sh +sudo mv nextflow /usr/local/bin +``` + +#### d. Confirm that Nextflow is installed correctly: + +```sh +nextflow info +``` + +### 2. Update Nextflow Config + +Add the following to your `nextflow.config` in order to use the GA4GH TES plugin: + +```yaml +cat <> nextflow.config +plugins { + id 'nf-ga4gh' +} + +process.executor = 'tes' +tes.endpoint = 'http://localhost:8000' # <--- Funnel's default address +EOF +``` + +### 3. Start the Funnel Server + +Start the Funnel server: + +```sh +funnel server run +``` + +### 4. Run Nextflow + +In another window, run the workflow: + +```sh +nextflow run main.nf -c nextflow.config +``` + +## Additional Resources + +- [Nextflow Homepage](https://nextflow.io/) + +- [Nextflow Documentation](https://www.nextflow.io/docs) + +- [Nextflow's TES Support](https://www.nextflow.io/docs/latest/executor.html#ga4gh-tes) + +- [nf-core](https://nf-co.re/) + > A community effort to collect a curated set of analysis pipelines built using Nextflow. + +- [nf-canary](https://github.com/seqeralabs/nf-canary) + > A minimal Nextflow workflow for testing infrastructure. + +- [Nextflow Patterns](https://nextflow-io.github.io/patterns/) + > A curated collection of Nextflow implementation patterns diff --git a/docs/tools/funnel/docs/integrations/py-tes.md b/docs/tools/funnel/docs/integrations/py-tes.md new file mode 100644 index 0000000..7b12061 --- /dev/null +++ b/docs/tools/funnel/docs/integrations/py-tes.md @@ -0,0 +1,50 @@ +--- +title: py-tes +menu: + main: + parent: Integrations +--- + +> ⚠️ py-tes support is in active development and may be subject to change. + +# py-tes + +[py-tes](https://github.com/ohsu-comp-bio/py-tes) is a library for interacting with servers implementing the [GA4GH Task Execution Schema](https://github.com/ga4gh/task-execution-schemas). + +## Getting Started + +### Install + +Available on [PyPI](https://pypi.org/project/py-tes/). 
+ +```sh +pip install py-tes +``` + +### Example Python Script + +```py +import tes + +task = tes.Task( + executors=[ + tes.Executor( + image="alpine", + command=["echo", "hello"] + ) + ] +) + +cli = tes.HTTPClient("http://funnel.example.com", timeout=5) +task_id = cli.create_task(task) +res = cli.get_task(task_id) +cli.cancel_task(task_id) +``` + +## Additional Resources + +- [py-tes Homepage](https://github.com/ohsu-comp-bio/py-tes) + +- [py-tes Documentation](https://ohsu-comp-bio.github.io/py-tes/) + +- [py-tes on PyPi](https://pypi.org/project/py-tes/) diff --git a/docs/tools/funnel/docs/metrics.md b/docs/tools/funnel/docs/metrics.md new file mode 100644 index 0000000..1077112 --- /dev/null +++ b/docs/tools/funnel/docs/metrics.md @@ -0,0 +1,8 @@ +--- +title: Metrics +menu: + main: + identifier: Metrics + weight: 6 +--- +# Metrics diff --git a/docs/tools/funnel/docs/metrics/prometheus.md b/docs/tools/funnel/docs/metrics/prometheus.md new file mode 100644 index 0000000..1b3495b --- /dev/null +++ b/docs/tools/funnel/docs/metrics/prometheus.md @@ -0,0 +1,36 @@ +--- +title: Prometheus +menu: + main: + parent: Metrics +--- + +# Prometheus + +[Prometheus][prom] is a monitoring and metrics collection service. It pulls metrics +from various "exporters", collects them in a time-series database, provides +a query langauge for access that data, and integrates closely with tools +such as [Grafana][graf] for visualization and dashboard building. + +Funnel exports these metrics: + +- `funnel_tasks_state_count`: the number of tasks + in each state (queued, running, etc). +- `funnel_nodes_state_count`: the number of nodes + in each state (alive, dead, draining, etc). +- `funnel_nodes_total_cpus`: the total number + of CPUs available by all nodes. +- `funnel_nodes_total_ram_bytes`: the total number + of bytes of RAM available by all nodes. +- `funnel_nodes_total_disk_bytes`: the total number + of bytes of disk space available by all nodes. +- `funnel_nodes_available_cpus`: the available number + of CPUs available by all nodes. +- `funnel_nodes_available_ram_bytes`: the available number + of bytes of RAM available by all nodes. +- `funnel_nodes_available_disk_bytes`: the available number + of bytes of disk space available by all nodes. + +[prom]: https://prometheus.io/ +[gauge]: https://prometheus.io/docs/concepts/metric_types/#gauge +[graf]: https://grafana.com/ diff --git a/docs/tools/funnel/docs/security.md b/docs/tools/funnel/docs/security.md new file mode 100644 index 0000000..c3dba45 --- /dev/null +++ b/docs/tools/funnel/docs/security.md @@ -0,0 +1,8 @@ +--- +title: Security +menu: + main: + weight: 10 +--- + +# Security diff --git a/docs/tools/funnel/docs/security/advanced.md b/docs/tools/funnel/docs/security/advanced.md new file mode 100644 index 0000000..3864e34 --- /dev/null +++ b/docs/tools/funnel/docs/security/advanced.md @@ -0,0 +1,29 @@ +--- +title: Advanced Auth +menu: + main: + parent: Security + weight: 10 +--- + +# Overview 🔐 + +Thanks to our collaborators at CTDS — Funnel is currently adding support for "Per-User/Per-Bucket" credentials to allow Users to access S3 Buckets without having to store their credentials in the Funnel Server. 
+ +The high level overview of this feature will be such Funnel will be able to speak with a custom credential "Wrapper Script" that will: + +- Take the User Credentials +- Create an S3 Bucket +- Generate a Key (optionally for use in Nextflow Config) +- Send the Key to Funnel + +In this way this Wrapper can manage the bucket and the keys (the Wrapper would be the middleware between the User and Funnel). + +Stay tuned for this feature's development! This feature is being tracked with the following: + +- GitHub Branch: https://github.com/ohsu-comp-bio/funnel/tree/feature/credentials +- Pull Request: https://github.com/ohsu-comp-bio/funnel/pull/1098 + +# Credits 🙌 + +This feature and its development would not be possible without our continuing collaboration with [Pauline Ribeyre](https://github.com/paulineribeyre), [Jawad Qureshi](https://github.com/jawadqur), [Michael Fitzsimons](https://www.linkedin.com/in/michael-fitzsimons-ab8a6111), and the entire [CTDS](https://ctds.uchicago.edu) team at the [University of Chicago](https://www.uchicago.edu/)! diff --git a/docs/tools/funnel/docs/security/basic.md b/docs/tools/funnel/docs/security/basic.md new file mode 100644 index 0000000..0b19e07 --- /dev/null +++ b/docs/tools/funnel/docs/security/basic.md @@ -0,0 +1,59 @@ +--- +title: Basic Auth +menu: + main: + parent: Security + weight: 10 +--- +# Basic Auth + +By default, a Funnel server allows open access to its API endpoints, but it +can be configured to require basic password authentication. To enable this, +include users and passwords in your config file: + +```yaml +Server: + BasicAuth: + - User: admin + Password: someReallyComplexSecret + Admin: true + - User: funnel + Password: abc123 + + TaskAccess: OwnerOrAdmin +``` + +The `TaskAccess` property configures the visibility and access-mode for tasks: + +* `All` (default) - all tasks are visible to everyone +* `Owner` - tasks are visible to the users who created them +* `OwnerOrAdmin` - extends `Owner` by allowing Admin-users (`Admin: true`) + access everything + +As new tasks are created, the username behind the request is recorded as the +owner of the task. Depending on the `TaskAccess` property, if owner-based +acces-mode is enabled, the owner of the task is compared to username of current +request to decide if the user may see and interact with the task. + +If you are using BoltDB or Badger, the Funnel worker communicates to the server via gRPC +so you will also need to configure the RPC client. + +```yaml +RPCClient: + User: funnel + Password: abc123 +``` + +Make sure to properly protect the configuration file so that it's not readable +by everyone: + +```bash +$ chmod 600 funnel.config.yml +``` + +To use the password, set the `FUNNEL_SERVER_USER` and `FUNNEL_SERVER_PASSWORD` environment variables: +```bash +$ export FUNNEL_SERVER_USER=funnel +$ export FUNNEL_SERVER_PASSWORD=abc123 +$ funnel task list +``` diff --git a/docs/tools/funnel/docs/security/oauth2.md b/docs/tools/funnel/docs/security/oauth2.md new file mode 100644 index 0000000..4b4232d --- /dev/null +++ b/docs/tools/funnel/docs/security/oauth2.md @@ -0,0 +1,74 @@ +--- +title: OAuth2 +menu: + main: + parent: Security + weight: 10 +--- +# OAuth2 + +By default, a Funnel server allows open access to its API endpoints, but in +addition to Basic authentication it can also be configured to require a valid +JWT in the request. + +Funnel itself does not redirect users to perform the login. 
+It just validates that the presented token is issued by a trusted service +(specified in the YAML configuration file) and the token has not expired. +In addition, if the OIDC provides a token introspection endpoint (in its +configuration JSON), Funnel server also calls that endpoint to make sure the +token is still active (i.e., no token invalidation before expiring). + +Optionally, Funnel can also validate the scope and audience claims to contain +specific values. + +To enable JWT authentication, specify `OidcAuth` section in your config file: + +```yaml +Server: + OidcAuth: + # URL of the OIDC service configuration: + ServiceConfigURL: "https://my.oidc.service/.well-known/openid-configuration" + + # Client ID and secret are sent with the token introspection request + # (Basic authentication): + ClientId: your-client-id + ClientSecret: your-client-secret + + # Optional: if specified, this scope value must be in the token: + RequireScope: funnel-id + + # Optional: if specified, this audience value must be in the token: + RequireAudience: tes-api + + # The URL where OIDC should redirect after login (keep the path '/login') + RedirectURL: "http://localhost:8000/login" + + # List of OIDC subjects promoted to Admin status. + Admins: + - user.one@example.org + - user.two@example.org + + TaskAccess: OwnerOrAdmin +``` + +The `TaskAccess` property configures the visibility and access-mode for tasks: + +* `All` (default) - all tasks are visible to everyone +* `Owner` - tasks are visible to the users who created them +* `OwnerOrAdmin` - extends `Owner` by allowing Admin-users (defined under + `Admins`) access everything + +As new tasks are created, the username behind the request is recorded as the +owner of the task. Depending on the `TaskAccess` property, if owner-based +acces-mode is enabled, the owner of the task is compared to username of current +request to decide if the user may see and interact with the task. + +Make sure to properly protect the configuration file so that it's not readable +by everyone: + +```bash +$ chmod 600 funnel.config.yml +``` + +Note that the Funnel UI supports login through an OIDC service. However, OIDC +authentication is not supported at command-line. diff --git a/docs/tools/funnel/docs/storage.md b/docs/tools/funnel/docs/storage.md new file mode 100644 index 0000000..9297161 --- /dev/null +++ b/docs/tools/funnel/docs/storage.md @@ -0,0 +1,8 @@ +--- +title: Storage +menu: + main: + identifier: Storage + weight: -10 +--- +# Storage diff --git a/docs/tools/funnel/docs/storage/ftp.md b/docs/tools/funnel/docs/storage/ftp.md new file mode 100644 index 0000000..79b2439 --- /dev/null +++ b/docs/tools/funnel/docs/storage/ftp.md @@ -0,0 +1,38 @@ +--- +title: FTP +menu: + main: + parent: Storage +--- + +# FTP + +Funnel supports download and uploading files via FTP. + +Currently authentication credentials are take from the URL, e.g. `ftp://username:password@ftp.host.tld`. This will be improved soon to allow credentials to be added to the configuration file. 
+ +The FTP storage client is enabled by default, but may be explicitly disabled in the +worker config: + +```yaml +FTPStorage: + Disabled: false +``` + +### Example task +```json +{ + "name": "Hello world", + "inputs": [{ + "url": "ftp://my.ftpserver.xyz/hello.txt", + "path": "/inputs/hello.txt" + }, { + "url": "ftp://user:mypassword123@my.ftpserver.xyz/hello.txt", + "path": "/inputs/hello.txt" + }], + "executors": [{ + "image": "alpine", + "command": ["cat", "/inputs/hello.txt"], + }] +} +``` diff --git a/docs/tools/funnel/docs/storage/google-storage.md b/docs/tools/funnel/docs/storage/google-storage.md new file mode 100644 index 0000000..d8fde4f --- /dev/null +++ b/docs/tools/funnel/docs/storage/google-storage.md @@ -0,0 +1,43 @@ +--- +title: Google Storage +menu: + main: + parent: Storage +--- + +# Google Storage + +Funnel supports using [Google Storage][gs] (GS) for file storage. + +The Google storage client is enabled by default, and will try to automatically +load credentials from the environment. Alternatively, you +may explicitly set the credentials in the worker config: + +```yaml +GoogleStorage: + Disabled: false + # Path to account credentials file. + AccountFile: "" +``` + +### Example task +```json +{ + "name": "Hello world", + "inputs": [{ + "url": "gs://funnel-bucket/hello.txt", + "path": "/inputs/hello.txt" + }], + "outputs": [{ + "url": "gs://funnel-bucket/output.txt", + "path": "/outputs/hello-out.txt" + }], + "executors": [{ + "image": "alpine", + "command": ["cat", "/inputs/hello.txt"], + "stdout": "/outputs/hello-out.txt", + }] +} +``` + +[gs]: https://cloud.google.com/storage/ diff --git a/docs/tools/funnel/docs/storage/http.md b/docs/tools/funnel/docs/storage/http.md new file mode 100644 index 0000000..8192205 --- /dev/null +++ b/docs/tools/funnel/docs/storage/http.md @@ -0,0 +1,37 @@ +--- +title: HTTP(S) +menu: + main: + parent: Storage +--- + +# HTTP(S) + +Funnel supports downloading files from public URLs via GET requests. No authentication +mechanism is allowed. This backend can be used to fetch objects from cloud storage +providers exposed using presigned URLs. + +The HTTP storage client is enabled by default, but may be explicitly disabled in the +worker config: + +```yaml +HTTPStorage: + Disabled: false + # Timeout for http(s) GET requests. + Timeout: 30s +``` + +### Example task +```json +{ + "name": "Hello world", + "inputs": [{ + "url": "http://fakedomain.com/hello.txt", + "path": "/inputs/hello.txt" + }], + "executors": [{ + "image": "alpine", + "command": ["cat", "/inputs/hello.txt"], + }] +} +``` diff --git a/docs/tools/funnel/docs/storage/local.md b/docs/tools/funnel/docs/storage/local.md new file mode 100644 index 0000000..eb68669 --- /dev/null +++ b/docs/tools/funnel/docs/storage/local.md @@ -0,0 +1,63 @@ +--- +title: Local +menu: + main: + parent: Storage + weight: -10 +--- + +# Local + +Funnel supports using the local filesystem for file storage. + +Funnel limits which directories may be accessed, by default only allowing directories +under the current working directory of the Funnel worker. + +Config: +```yaml +LocalStorage: + # Whitelist of local directory paths which Funnel is allowed to access. + AllowedDirs: + - ./ + - /path/to/allowed/dir + - ...etc +``` + +### Example task + +Files must be absolute paths in `file:///path/to/file.txt` URL form. 
+ +``` +{ + "name": "Hello world", + "inputs": [{ + "url": "file:///path/to/funnel-data/hello.txt", + "path": "/inputs/hello.txt" + }], + "outputs": [{ + "url": "file:///path/to/funnel-data/output.txt", + "path": "/outputs/hello-out.txt" + }], + "executors": [{ + "image": "alpine", + "command": ["cat", "/inputs/hello.txt"], + "stdout": "/outputs/hello-out.txt", + }] +} +``` + +### File hard linking behavior + +For efficiency, Funnel will attempt not to copy the input files, instead trying +create a hard link to the source file. In some cases this isn't possible. For example, +if the source file is on a network file system mount (e.g. NFS) but the Funnel worker's +working directory is on the local scratch disk, a hard link would cross a file system +boundary, which is not possible. In this case, Funnel will copy the file. + +### File ownership behavior + +One difficult area of files and Docker containers is file owner/group management. +If a Docker container runs as root, it's likely that the file will end up being owned +by root on the host system. In this case, some step (Funnel or another task) will +likely fail to access it. This is a tricky problem with no good solution yet. +See [issue 66](https://github.com/ohsu-comp-bio/funnel/issues/66). diff --git a/docs/tools/funnel/docs/storage/s3.md b/docs/tools/funnel/docs/storage/s3.md new file mode 100644 index 0000000..75a0271 --- /dev/null +++ b/docs/tools/funnel/docs/storage/s3.md @@ -0,0 +1,96 @@ +--- +title: S3 +menu: + main: + parent: Storage +--- + +# S3 + +## Amazon S3 + +Funnel supports using [AWS S3](https://aws.amazon.com/s3/) for file storage. + +The Amazon S3 storage client is enabled by default, and will try to automatically +load credentials from the environment. Alternatively, you +may explicitly set the credentials in the worker config: + +```yaml +AmazonS3: + Disabled: false + # The maximum number of times that a request will be retried for failures. + MaxRetries: 10 + Key: "" + Secret: "" +``` + +The Amazon S3 storage client also supports SSE-KMS and SSE-C configurations. + +For SSE-KMS as long as your credentials can access the KMS key used for the +given bucket, no special configuration is required. However, you can specifiy a +specific KMS key if desired: + +```yaml +AmazonS3: + SSE: + KMSKey: "1a03ce70-5f03-484e-8396-0e97de661b79" +``` + +For SSE-C: + +Generate a key file: + +```sh +openssl rand -out sse-c.key 32 +``` + +Then configure the storage client to use it: + +```yaml +AmazonS3: + SSE: + CustomerKeyFile: "./sse-c.key" +``` + +Note that this file will need to be available to all Funnel workers. + +## Other S3 API Providers + +Funnel also supports using non-Amazon S3 API providers ([Ceph][ceph], +[Cleversafe][cleversafe], [Minio][minio], etc.) for file storage. + +These other S3 storage clients are NOT enabled by default. You must configure them. + +This storage client also supports the [version 4 signing process](https://docs.aws.amazon.com/AmazonS3/latest/API/sig-v4-authenticating-requests.html). 
+ +```yaml +GenericS3: + - Disabled: false + Endpoint: "" + Key: "" + Secret: "" +``` + +### Example task +```json +{ + "name": "Hello world", + "inputs": [{ + "url": "s3://funnel-bucket/hello.txt", + "path": "/inputs/hello.txt" + }], + "outputs": [{ + "url": "s3://funnel-bucket/output.txt", + "path": "/outputs/hello-out.txt" + }], + "executors": [{ + "image": "alpine", + "command": ["cat", "/inputs/hello.txt"], + "stdout": "/outputs/hello-out.txt" + }] +} +``` + +[ceph]: http://ceph.com/ +[cleversafe]: https://www.ibm.com/cloud/object-storage +[minio]: https://minio.io/ diff --git a/docs/tools/funnel/docs/storage/swift.md b/docs/tools/funnel/docs/storage/swift.md new file mode 100644 index 0000000..3de323a --- /dev/null +++ b/docs/tools/funnel/docs/storage/swift.md @@ -0,0 +1,53 @@ +--- +title: OpenStack Swift +menu: + main: + parent: Storage +--- + +# OpenStack Swift + +Funnel supports using [OpenStack Swift][swift] for file storage. + +The Swift storage client is enabled by default, and will try to automatically +load credentials from the environment. Alternatively, you +may explicitly set the credentials in the worker config: + +```yaml +Swift: + Disabled: false + UserName: "" + Password: "" + AuthURL: "" + TenantName: "" + TenantID: "" + RegionName: "" + # 500 MB + ChunkSizeBytes: 500000000 +``` + +### Example task +```json +{ + "name": "Hello world", + "inputs": [{ + "url": "swift://funnel-bucket/hello.txt", + "path": "/inputs/hello.txt" + }], + "outputs": [{ + "url": "swift://funnel-bucket/output.txt", + "path": "/outputs/hello-out.txt" + }], + "executors": [{ + "image": "alpine", + "command": ["cat", "/inputs/hello.txt"], + "stdout": "/outputs/hello-out.txt", + }] +} +``` + +### Known Issues: + +The config currently only supports OpenStack v2 auth. See [issue #336](https://github.com/ohsu-comp-bio/funnel/issues/336). + +[swift]: https://docs.openstack.org/swift/latest/ diff --git a/docs/tools/funnel/docs/tasks.md b/docs/tools/funnel/docs/tasks.md new file mode 100644 index 0000000..ed9d25f --- /dev/null +++ b/docs/tools/funnel/docs/tasks.md @@ -0,0 +1,494 @@ +--- +title: Tasks +menu: + main: + identifier: tasks + weight: -70 +--- + +# Tasks + +A task defines a unit of work: + +- metadata +- input files to download +- a sequence of Docker containers + commands to run, +- output files to upload +- state +- logs + +The example task below downloads a file named `hello.txt` from S3 and calls `cat hello.txt` using the [alpine][alpine] container. This task also writes the executor's stdout to a file, and uploads the stdout to s3. + +``` +{ + "name": "Hello world", + "inputs": [{ + # URL to download file from. + "url": "s3://funnel-bucket/hello.txt", + # Path to download file to. + "path": "/inputs/hello.txt" + }], + "outputs": [{ + # URL to upload file to. + "url": "s3://funnel-bucket/output.txt", + # Local path to upload file from. + "path": "/outputs/stdout" + }], + "executors": [{ + # Container image name. + "image": "alpine", + # Command to run (argv). + "command": ["cat", "/inputs/hello.txt"], + # Capture the stdout of the command to /outputs/stdout + "stdout": "/outputs/stdout" + }] +} +``` + +Tasks have multiple "executors"; containers and commands run in a sequence. +Funnel runs executors via Docker. 
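For example, here is a sketch of a task with two executors that run in order, using the task-level `volumes` field to share a scratch directory between the two containers (the bucket and paths are illustrative):

```
{
  "name": "Two-step example",
  # Directories listed in "volumes" are shared across all executors.
  "volumes": ["/scratch"],
  "executors": [{
    # Step 1 writes an intermediate file to the shared volume.
    "image": "alpine",
    "command": ["sh", "-c", "echo step one > /scratch/intermediate.txt"]
  }, {
    # Step 2 runs after step 1 completes, on the same machine.
    "image": "alpine",
    "command": ["cat", "/scratch/intermediate.txt"],
    "stdout": "/outputs/stdout"
  }],
  "outputs": [{
    # The captured stdout of step 2 is uploaded after the task finishes.
    "url": "s3://funnel-bucket/two-step-output.txt",
    "path": "/outputs/stdout"
  }]
}
```

If any executor exits with a non-zero code, execution stops there and the task is marked `EXECUTOR_ERROR`.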
+ +Tasks also have state and logs: +``` +{ + "id": "b85khc2rl6qkqbhg8vig", + "state": "COMPLETE", + "name": "Hello world", + "inputs": [ + { + "url": "s3://funnel-bucket/hello.txt", + "path": "/inputs/hello.txt" + } + ], + "outputs": [ + { + "url": "s3://funnel-bucket/output.txt", + "path": "/outputs/stdout" + } + ], + "executors": [ + { + "image": "alpine", + "command": [ + "cat", + "/inputs/hello.txt" + ], + "stdout": "/outputs/stdout" + } + ], + "logs": [ + { + "logs": [ + { + "startTime": "2017-11-14T11:49:05.127885125-08:00", + "endTime": "2017-11-14T11:49:08.484461502-08:00", + "stdout": "Hello, Funnel!\n" + } + ], + "startTime": "2017-11-14T11:49:04.433593468-08:00", + "endTime": "2017-11-14T11:49:08.487707039-08:00" + } + ], + "creationTime": "2017-11-14T11:49:04.427163701-08:00" +} +``` + +There are logs for each task attempt and each executor. Notice that the stdout is +conveniently captured by `logs[0].logs[0].stdout`. + +### Task API + +The API lets you create, get, list, and cancel tasks. + +### Create +``` +POST /v1/tasks +{ + "name": "Hello world", + "inputs": [{ + "url": "s3://funnel-bucket/hello.txt", + "path": "/inputs/hello.txt" + }], + "outputs": [{ + "url": "s3://funnel-bucket/output.txt", + "path": "/outputs/stdout" + }], + "executors": [{ + "image": "alpine", + "command": ["cat", "/inputs/hello.txt"], + "stdout": "/outputs/stdout" + }] +} + + +# The response is a task ID: +b85khc2rl6qkqbhg8vig +``` + +### Get +``` +GET /v1/tasks/b85khc2rl6qkqbhg8vig + +{"id": "b85khc2rl6qkqbhg8vig", "state": "COMPLETE"} +``` + +By default, the minimal task view is returned which describes only the ID and state. +In order to get the original task with some basic logs, use the "BASIC" task view: +``` +GET /v1/tasks/b85khc2rl6qkqbhg8vig?view=BASIC +{ + "id": "b85khc2rl6qkqbhg8vig", + "state": "COMPLETE", + "name": "Hello world", + "inputs": [ + { + "url": "gs://funnel-bucket/hello.txt", + "path": "/inputs/hello.txt" + } + ], + "outputs": [ + { + "url": "s3://funnel-bucket/output.txt", + "path": "/outputs/stdout" + } + ], + "executors": [ + { + "image": "alpine", + "command": [ + "cat", + "/inputs/hello.txt" + ], + "stdout": "/outputs/stdout", + } + ], + "logs": [ + { + "logs": [ + { + "startTime": "2017-11-14T11:49:05.127885125-08:00", + "endTime": "2017-11-14T11:49:08.484461502-08:00", + } + ], + "startTime": "2017-11-14T11:49:04.433593468-08:00", + "endTime": "2017-11-14T11:49:08.487707039-08:00" + } + ], + "creationTime": "2017-11-14T11:49:04.427163701-08:00" +} +``` + +The "BASIC" doesn't include some fields such as stdout/err logs, because these fields may be potentially large. 
+In order to get everything, use the "FULL" view: +``` +GET /v1/tasks/b85khc2rl6qkqbhg8vig?view=FULL +{ + "id": "b85khc2rl6qkqbhg8vig", + "state": "COMPLETE", + "name": "Hello world", + "inputs": [ + { + "url": "gs://funnel-bucket/hello.txt", + "path": "/inputs/hello.txt" + } + ], + "executors": [ + { + "image": "alpine", + "command": [ + "cat", + "/inputs/hello.txt" + ], + "stdout": "/outputs/stdout", + } + ], + "logs": [ + { + "logs": [ + { + "startTime": "2017-11-14T11:49:05.127885125-08:00", + "endTime": "2017-11-14T11:49:08.484461502-08:00", + "stdout": "Hello, Funnel!\n" + } + ], + "startTime": "2017-11-14T11:49:04.433593468-08:00", + "endTime": "2017-11-14T11:49:08.487707039-08:00" + } + ], + "creationTime": "2017-11-14T11:49:04.427163701-08:00" +} +``` + +### List +``` +GET /v1/tasks +{ + "tasks": [ + { + "id": "b85l8tirl6qkqbhg8vj0", + "state": "COMPLETE" + }, + { + "id": "b85khc2rl6qkqbhg8vig", + "state": "COMPLETE" + }, + { + "id": "b85kgt2rl6qkpuptua70", + "state": "SYSTEM_ERROR" + }, + { + "id": "b857gnirl6qjfou61fh0", + "state": "SYSTEM_ERROR" + } + ] +} +``` + +List has the same task views as Get: MINIMAL, BASIC, and FULL. + +The task list is paginated: +``` +GET /v1/tasks?page_token=1h123h12j2h3k +{ + "next_page_token": "1n3n1j23k12n3k123", + "tasks": [ + { + "id": "b85l8tirl6qkqbhg8vj0", + "state": "COMPLETE" + }, + # ... more tasks here ... + ] +} +``` + +### Cancel + +Tasks cannot be modified by the user after creation, with one exception – they can be canceled. +``` +POST /v1/tasks/b85l8tirl6qkqbhg8vj0:cancel +``` + + +### Full task spec + +Here's a more detailed description of a task. +For a full, in-depth spec, read the TES standard's [task_execution.proto](https://github.com/ga4gh/task-execution-schemas/blob/master/task_execution.proto). + +``` +{ + # The task's ID. Set by the server. + # Output only. + "id": "1234567", + + # The task's state. Possible states: + # QUEUED + # INITILIZING + # RUNNING + # PAUSED + # COMPLETE + # EXECUTOR_ERROR + # SYSTEM_ERROR + # CANCELED + # + # Output only. + "state": "QUEUED", + + # Metadata + "name": "Task name.", + "description": "Task description.", + "tags": { + "custom-tag-1": "tag-value-1", + "custom-tag-2": "tag-value-2", + }, + + # Resource requests + "resources": { + # Number of CPU cores requested. + "cpuCores": 1, + + # RAM request, in gigabytes. + "ramGb": 1.0, + + # Disk space request, in gigabytes. + "diskGb": 100.0, + + # Request preemptible machines, + # e.g. preemptible VM in Google Cloud, an instance from the AWS Spot Market, etc. + "preemptible": false, + + # Request that the task run in these compute zones. + "zones": ["zone1", "zone2"], + }, + + # Input files will be downloaded by the worker. + # This example uses s3, but Funnel supports multiple filesystems. + "inputs": [ + { + "name": "Input file.", + "description": "Input file description.", + + # URL to download file from. + "url": "s3://my-bucket/object/path/file.txt", + # Path to download file to. + "path": "/container/input.txt" + }, + { + "name": "Input directory.", + "description": "Directories are also supported.", + "url": "s3://my-bucket/my-data/", + "path": "/inputs/my-data/", + "type": "DIRECTORY" + }, + + # A task may include the file content directly in the task message. + # This is sometimes useful for small files such as scripts, + # which you want to include without talking directly to the filesystem. 
+ { + "path": "/inputs/script.py", + "content": "import socket; print socket.gethostname()" + } + ], + + # Output files will be uploaded to storage by the worker. + "outputs": [ + { + "name": "Output file.", + "description": "Output file description.", + "url": "s3://my-bucket/output-data/results.txt", + "path": "/outputs/results.txt" + }, + { + "name": "Output directory.", + "description": "Directories are also supported.", + "url": "s3://my-bucket/output-data/output-dir/", + "path": "/outputs/data-dir/", + "type": "DIRECTORY" + } + ], + + # Executors define a sequence of containers + commands to run. + # Execution stop on the first non-zero exit code. + "executors": [ + { + # Container image name. + # Funnel supports running executor containers via Docker. + "image": "ubuntu", + + # Command arguments (argv). + # The first item is the executable to run. + "command": ["my-tool-1", "/container/input"], + + # Local file path to read stdin from. + "stdin": "/inputs/stdin.txt", + + # Local file path to write stdout to. + "stdout": "/container/output", + + # Local file path to write stderr to. + "stderr": "/container/stderr", + + # Set the working directory before executing the command. + "workdir": "/data/workdir", + + # Environment variables + "env": { + "ENV1": "value1", + "ENV2": "value2", + } + }, + + # Second executor runs after the first completes, on the same machine. + { + "image": "ubuntu", + "command": ["cat", "/container/input"], + "stdout": "/container/output", + "stderr": "/container/stderr", + "workdir": "/tmp" + } + ] + + # Date/time the task was created. + # Set the the server. + # Output only. + "creationTime": "2017-11-14T11:49:04.427163701-08:00" + + # Task logs. + # Output only. + # + # If there's a system error, the task may be attempted multiple times, + # so this field is a list of attempts. In most cases, there will be only + # one or zero entries here. + "logs": [ + + # Attempt start/end times, in RFC3339 format. + "startTime": "2017-11-14T11:49:04.433593468-08:00", + "endTime": "2017-11-14T11:49:08.487707039-08:00" + + # Arbitrary metadata set by Funnel. + "metadata": { + "hostname": "worker-1", + }, + + # Arbitrary system logs which Funnel thinks are useful to the user. + "systemLogs": [ + "task was assigned to worker 1", + "docker command: docker run -v /vol:/data alpine cmd arg1 arg2", + ], + + # Log of files uploaded to storage by the worker, + # including all files in directories, with file sizes. + "outputs": [ + { + "url": "s3://my-bucket/output-data/results.txt", + "path": "/outputs/results.txt", + "sizeBytes": 123 + }, + { + "url": "s3://my-bucket/output-data/output-dir/file1.txt", + "path": "/outputs/data-dir/file1.txt", + "sizeBytes": 123 + }, + { + "url": "s3://my-bucket/output-data/output-dir/file2.txt", + "path": "/outputs/data-dir/file2.txt", + "sizeBytes": 123 + } + { + "url": "s3://my-bucket/output-data/output-dir/subdir/file3.txt", + "path": "/outputs/data-dir/subdir/file3.txt", + "sizeBytes": 123 + } + ], + + # Executor logs. One entry per executor. + "logs": [ + { + # Executor start/end time, in RFC3339 format. + "startTime": "2017-11-14T11:49:05.127885125-08:00", + "endTime": "2017-11-14T11:49:08.484461502-08:00", + + # Executor stdout/err. Only available in the FULL task view. + # + # There is a size limit for these fields, which is configurable + # and defaults to 10KB. If more than 10KB is generated, only the + # tail will be logged. 
If the full output is needed, the task + # may use Executor.stdout and an output to upload the full content + # to storage. + "stdout": "Hello, Funnel!", + "stderr": "", + + # Exit code + "exit_code": 0, + }, + { + "startTime": "2017-11-14T11:49:05.127885125-08:00", + "endTime": "2017-11-14T11:49:08.484461502-08:00", + "stdout": "Hello, Funnel!\n" + } + ], + } + ], +} +``` + +[alpine]: https://hub.docker.com/_/alpine/ diff --git a/docs/tools/funnel/download.md b/docs/tools/funnel/download.md new file mode 100644 index 0000000..5359c76 --- /dev/null +++ b/docs/tools/funnel/download.md @@ -0,0 +1,35 @@ +--- +title: Download +menu: + main: + weight: -2000 +--- + +## Releases + +See the [Releases](https://github.com/ohsu-comp-bio/funnel/releases) page for release history. + + +--8<-- "docs/tools/funnel/_releases.md" + +## Homebrew + +```sh +brew tap ohsu-comp-bio/formula +brew install funnel@0.11 +``` + +## Build the lastest development version + +In order to build the latest code, run: +```shell +$ git clone https://github.com/ohsu-comp-bio/funnel.git +$ cd funnel +$ make +``` + +Funnel requires Go 1.21+. Check out the [development docs][dev] for more detail. + + +[dev]: /docs/development/developers/ +[docker]: https://docker.io diff --git a/docs/tools/funnel/index.md b/docs/tools/funnel/index.md new file mode 100644 index 0000000..4c780e5 --- /dev/null +++ b/docs/tools/funnel/index.md @@ -0,0 +1,197 @@ +--- +title: Funnel +--- + +## Funnel Tool Documentation + +The Funnel tool is designed to streamline data processing workflows, enabling efficient data transformation and analysis. Key features include: + +- **S3 Integration**: Seamlessly add and manage files from Amazon S3. +- **Data Transformation**: Predefined pipelines for common data processing tasks. +- **Automation**: Schedule and automate repetitive data workflows. +- **Monitoring**: Track the status and performance of data jobs in real-time. +- **Workflow engine compatibile**: Compatible with Nextflow + +## Simple API +A task describes metadata, state, input/output files, resource requests, commands, and logs. + +The task API has four actions: create, get, list, and cancel. + +Funnel serves both HTTP/JSON and gRPC/Protobuf. + +##Standards based + +The Task API is developed via an open standard effort. + +## Workers +Given a task, Funnel will queue it, schedule it to a worker, and track its state and logs. + +A worker will download input files, run a sequence of Docker containers, upload output files, and emits events and logs along the way. + +## Cross platform +We use Funnel on AWS, Google Cloud, OpenStack, and the good ol' university HPC cluster. + +## Adaptable + +A wide variety of options make Funnel easily adaptable: + + - BoltDB + - Elasticsearch + - MongoDB + - AWS Batch, S3, DynamoDB + - OpenStack Swift + - Google Cloud Storage, Datastore + - Kafka + - HPC support: HTCondor, Slurm, etc. + - and more + + --- + +# Define a task + +A task describes metadata, state, input/output files, resource requests, commands, and logs. + +For a full description of the task fields, see the task API docs and the the task schema. + + +``` +$ funnel examples hello-world +{ + "name": "Hello world", + "description": "Demonstrates the most basic echo task.", + "executors": [ + { + "image": "alpine", + "command": ["echo", "hello world"], + } + ] +} +``` + +--- + +# Start a Funnel server + +localhost:8000 is the HTTP API and web dashboard. 
+localhost:9090 is the gRPC API (for internal communication) + + +``` +$ funnel server run +server Server listening +httpPort 8000 +rpcAddress :9090 +``` + + +--- + +# Create a task + +The output is the task ID. + +This example uses the development server, which will run the task locally via Docker. + + +``` +$ funnel examples hello-world > hello-world.json +$ funnel task create hello-world.json +b8581farl6qjjnvdhqn0 +``` + +--- + +# Get the task + +The output is the task with state and logs. + +By default, the CLI returns the "full" task view, which includes all logs plus stdout/err content. + + +``` +$ funnel task get b8581farl6qjjnvdhqn0 +{ + "id": "b8581farl6qjjnvdhqn0", + "state": "COMPLETE", + "name": "Hello world", + "description": "Demonstrates the most basic echo task.", + "executors": [ + { + "image": "alpine", + "command": [ + "echo", + "hello world" + ], + } + ], + "logs": [ + { + "logs": [ + { + "startTime": "2017-11-13T21:35:57.548592769-08:00", + "endTime": "2017-11-13T21:36:01.871905687-08:00", + "stdout": "hello world\n" + } + ], + "startTime": "2017-11-13T21:35:57.547408797-08:00", + "endTime": "2017-11-13T21:36:01.87496482-08:00" + } + ], + "creationTime": "2017-11-13T21:35:57.543528992-08:00" +} +``` + +--- + +List the tasks + + +``` +$ funnel task list --view MINIMAL +{ + "tasks": [ + { + "id": "b8581farl6qjjnvdhqn0", + "state": "COMPLETE" + }, + ... + ] +} +``` + +--- + +# Quickly create tasks + +The "run" command makes it easy to quickly create a task. By default, commands are wrapped in "sh -c" and run in the "alpine" container. + +Use the "--print" flag to print the task instead of running it immediately. + + +``` +$ funnel run 'md5sum $src' --in src=~/src.txt --print +{ + "name": "sh -c 'md5sum $src'", + "inputs": [ + { + "name": "src", + "url": "file:///Users/buchanae/src.txt", + "path": "/inputs/Users/buchanae/src.txt" + } + ], + "executors": [ + { + "image": "alpine", + "command": [ + "sh", + "-c", + "md5sum $src" + ], + "env": { + "src": "/inputs/Users/buchanae/src.txt" + } + } + ], +} + +``` diff --git a/docs/tools/git-drs/.nav.yml b/docs/tools/git-drs/.nav.yml new file mode 100644 index 0000000..f01a642 --- /dev/null +++ b/docs/tools/git-drs/.nav.yml @@ -0,0 +1,5 @@ +title: Git-DRS +nav: + - Overview: index.md + - Installation: installation.md + - Commands: commands.md diff --git a/docs/tools/git-drs/adding-s3-files.md b/docs/tools/git-drs/adding-s3-files.md new file mode 100644 index 0000000..c44d6a7 --- /dev/null +++ b/docs/tools/git-drs/adding-s3-files.md @@ -0,0 +1,209 @@ +# Adding S3 Files to Git DRS + +The `git drs add-url` command allows you to associate an S3 URL with a Git DRS repository without moving the actual data. This command registers the S3 file location in the Gen3 indexd service and creates a Git LFS pointer file. + +## Use Cases + +There are two main use cases for adding S3 files: + +### 1. Adding S3 Files from Gen3-Registered Buckets +If the S3 bucket is already registered in Gen3, the system can automatically retrieve the region and endpoint information from the Gen3 configuration. You only need to supply AWS credentials. + +### 2. Adding S3 Files from Non-Registered Buckets +If the S3 bucket is not registered in Gen3, you must provide both AWS credentials and bucket configuration (region and endpoint URL). 
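In practice the difference is just two extra flags. A minimal sketch of both cases is shown below (bucket names, the `<sha256>` placeholder, region, and endpoint are illustrative; full walkthroughs follow in the Examples section):

```bash
# Gen3-registered bucket: AWS credentials are enough
git drs add-url s3://my-registered-bucket/data.bam --sha256 <sha256>

# Non-registered bucket: also supply the region and endpoint
git drs add-url s3://my-custom-bucket/data.bam --sha256 <sha256> \
  --region us-west-2 \
  --endpoint-url https://s3.custom-provider.com
```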
+ +## AWS Configuration + +This command follows the standard AWS CLI authentication and configuration precedence as documented in the [AWS CLI Authentication Guide](https://docs.aws.amazon.com/cli/v1/userguide/cli-chap-authentication.html) + +### Configuration Priority (Highest to Lowest) + +1. **Command-line flags**: `--aws-access-key-id`, `--aws-secret-access-key`, `--region`, `--endpoint-url` +2. **Environment variables**: `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `AWS_REGION`, `AWS_ENDPOINT_URL` +3. **AWS configuration files**: `~/.aws/credentials` first, then `~/.aws/config` +4. **Gen3 bucket registration**: For registered buckets, region and endpoint are retrieved from Gen3 +5. **IAM roles**: For EC2 instances or containers with attached IAM roles + +See the [AWS CLI Configuration Guide](https://github.com/aws/aws-cli#configuration) for the various ways to set up your credentials. + +## Prerequisites + +- Git LFS tracking must be configured for the file +- AWS credentials with read access to the S3 bucket +- For non-registered buckets: AWS region and endpoint URL + +## Command Syntax + +```bash +git drs add-url s3://bucket/path/to/file --sha256 [options] +``` + +### Required Parameters + +- `s3://bucket/path/to/file`: The S3 URL of the file to be added +- `--sha256 `: The SHA256 hash of the file (64-character hexadecimal string) + +### Optional Parameters + +- `--aws-access-key-id `: AWS access key for authentication +- `--aws-secret-access-key `: AWS secret key for authentication +- `--region `: AWS region (e.g., `us-west-2`, `us-east-1`) + - Required for buckets not registered in Gen3 (unless configured in AWS config file) +- `--endpoint-url `: S3 endpoint URL (e.g., `https://s3.example.com`) + - Required for buckets not registered in Gen3 (unless configured in AWS config file) + +## Examples + +### Example 1: Gen3-Registered Bucket with Command-Line Credentials + +If your bucket is registered in Gen3, you only need to provide AWS credentials: + +```bash +# Track the file with Git LFS +git lfs track "my-file" +git add .gitattributes + +# Add the S3 file using command-line credentials +git drs add-url s3://my-registered-bucket/path/to/my-file \ + --sha256 1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef \ + --aws-access-key-id myAccessKey \ + --aws-secret-access-key mySecretKey + +# Commit and push +git commit -m "Add file from registered bucket" +git push +``` + +### Example 2: Gen3-Registered Bucket with Environment Variables + +```bash +# Set AWS credentials via environment variables +export AWS_ACCESS_KEY_ID=myAccessKey +export AWS_SECRET_ACCESS_KEY=mySecretKey + +# Track the file with Git LFS +git lfs track "my-file" +git add .gitattributes + +# Add the S3 file (credentials from environment) +git drs add-url s3://my-registered-bucket/path/to/my-file \ + --sha256 1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef + +# Commit and push +git commit -m "Add file from registered bucket" +git push +``` + +### Example 3: Non-Registered Bucket with Command-Line Credentials + +For buckets not registered in Gen3, provide region and endpoint: + +```bash +# Set credentials via environment variables +export AWS_ACCESS_KEY_ID=myAccessKey +export AWS_SECRET_ACCESS_KEY=mySecretKey + +# Track the file with Git LFS +git lfs track "my-file" +git add .gitattributes + +# Add the S3 file with region and endpoint +git drs add-url s3://my-custom-bucket/path/to/my-file \ + --sha256 1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef \ + --region us-west-2 \ 
+ --endpoint-url https://s3.custom-provider.com + +# Commit and push +git commit -m "Add file from custom bucket" +git push +``` + +### Example 4: Non-Registered Bucket with AWS Configuration Files + +You can also configure AWS credentials and settings in `~/.aws/credentials` and `~/.aws/config`: + +**~/.aws/credentials:** +```ini +[default] +aws_access_key_id = myAccessKey +aws_secret_access_key = mySecretKey +``` + +**~/.aws/config:** +```ini +[default] +region = us-west-2 +s3 = + endpoint_url = https://s3.custom-provider.com +``` + +Then run the command without any credential flags: + +```bash +git lfs track "my-file" +git add .gitattributes + +# Credentials and configuration loaded from ~/.aws/ files +git drs add-url s3://my-bucket/path/to/my-file \ + --sha256 1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef + +git commit -m "Add file using AWS config files" +git push +``` + +### Example 5: Multiple Files from Registered Bucket + +```bash +# Track all files in a directory +git lfs track "data-directory/**" +git add .gitattributes + +# Set credentials once +export AWS_ACCESS_KEY_ID=myAccessKey +export AWS_SECRET_ACCESS_KEY=mySecretKey + +# Add multiple files +git drs add-url s3://my-bucket/data-directory/file-1.dat \ + --sha256 1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef + +git drs add-url s3://my-bucket/data-directory/subdir/file-2.dat \ + --sha256 abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890 + +git drs add-url s3://my-bucket/data-directory/file-3.dat \ + --sha256 fedcba0987654321fedcba0987654321fedcba0987654321fedcba0987654321 + +# Commit all at once +git commit -m "Add multiple data files" +git push +``` + + +## Notes + +- **Git LFS Tracking**: Files must be tracked by Git LFS before running `add-url`. Use `git lfs track ` to configure tracking. +- **SHA256 Hash**: You must calculate the SHA256 hash of your file beforehand. Use `shasum -a 256 ` or similar tools. +- **Credentials Security**: Avoid putting credentials directly in command-line history. Use environment variables or AWS configuration files. +- **Bucket Registration**: For frequently used buckets, consider registering them in Gen3 to simplify the process. +- **Multiple URLs**: If a file is already registered, running `add-url` with a different S3 URL will add that URL to the existing record. +- **Project Isolation**: Each Git DRS project maintains separate indexd records, even for identical file hashes. + +## Troubleshooting + +### "file is not tracked by LFS" +Run `git lfs track ` to track the file pattern, then `git add .gitattributes`. + +### "Unable to get bucket details" +This means the bucket is not registered in Gen3. Provide `--region` and `--endpoint-url` flags or configure them in your AWS config file. + +### "unable to load AWS SDK config" +Check your AWS configuration: +- Verify credentials are set (via flags, environment, or `~/.aws/credentials`) +- Ensure `~/.aws/config` file is valid if you're using it +- Check that IAM roles are properly configured if running on EC2/ECS + +### "failed to head object" +This usually means: +- Credentials don't have permission to access the object +- The S3 URL is incorrect +- The endpoint or region is misconfigured +- Network connectivity issues diff --git a/docs/tools/git-drs/commands.md b/docs/tools/git-drs/commands.md new file mode 100644 index 0000000..9f05f80 --- /dev/null +++ b/docs/tools/git-drs/commands.md @@ -0,0 +1,507 @@ +# Commands Reference + +Complete reference for Git DRS and related Git LFS commands. 
+ +> **Navigation:** [Getting Started](getting-started.md) → **Commands Reference** → [Troubleshooting](troubleshooting.md) + +## Git DRS Commands + +### `git drs init` + +Initialize Git DRS in a repository. Sets up Git LFS custom transfer hooks and configures `.gitignore` patterns. + +**Usage:** + +```bash +git drs init [flags] +``` + +**Options:** + +- `--transfers `: Number of concurrent transfers (default: 4) + +**Example:** + +```bash +git drs init +``` + +**What it does:** + +- Initializes local configuration +- Configures Git LFS custom transfer agent +- Updates `.gitignore` to exclude DRS cache files +- Stages `.gitignore` changes automatically + +**Note:** Run this before adding remotes. + +### `git drs remote` + +Manage DRS remote server configurations. Git DRS supports multiple remotes for working with development, staging, and production servers. + +#### `git drs remote add gen3 ` + +Add a Gen3 DRS server configuration. + +**Usage:** + +```bash +git drs remote add gen3 \ + --url \ + --cred \ + --project \ + --bucket +``` + +**Options:** + +- `--url `: Gen3 server endpoint (required) +- `--cred `: Path to credentials JSON file (required) +- `--token `: Token for temporary access (alternative to --cred) +- `--project `: Project ID in format `-` (required) +- `--bucket `: S3 bucket name (required) + +**Examples:** + +```bash +# Add production remote +git drs remote add gen3 production \ + --url https://calypr-public.ohsu.edu \ + --cred /path/to/credentials.json \ + --project my-project \ + --bucket my-bucket + +# Add staging remote +git drs remote add gen3 staging \ + --url https://staging.calypr.ohsu.edu \ + --cred /path/to/staging-credentials.json \ + --project staging-project \ + --bucket staging-bucket +``` + +**Note:** The first remote you add automatically becomes the default remote. + +#### `git drs remote add anvil ` + +Add an AnVIL/Terra DRS server configuration. + +> **Note:** AnVIL support is under active development. For production use, we recommend Gen3 workflows or version 0.2.2 for AnVIL functionality. + +**Usage:** + +```bash +git drs remote add anvil --terraProject +``` + +**Options:** + +- `--terraProject `: Terra/Google Cloud project ID (required) + +**Example:** + +```bash +git drs remote add anvil development --terraProject my-terra-project +``` + +#### `git drs remote list` + +List all configured DRS remotes. + +**Usage:** + +```bash +git drs remote list +``` + +**Example Output:** + +``` +* production gen3 https://calypr-public.ohsu.edu + staging gen3 https://staging.calypr.ohsu.edu + development gen3 https://dev.calypr.ohsu.edu +``` + +The `*` indicates the default remote used by all commands unless specified otherwise. + +#### `git drs remote set ` + +Set the default DRS remote for all operations. + +**Usage:** + +```bash +git drs remote set +``` + +**Examples:** + +```bash +# Switch to staging for testing +git drs remote set staging + +# Switch back to production +git drs remote set production + +# Verify change +git drs remote list +``` + +### `git drs fetch [remote-name]` + +Fetch DRS object metadata from remote server. Downloads metadata only, not actual files. + +**Usage:** + +```bash +# Fetch from default remote +git drs fetch + +# Fetch from specific remote +git drs fetch staging +git drs fetch production +``` + +**Note:** `fetch` and `push` are commonly used together for cross-remote workflows. See `git drs push` below. 
+ +**What it does:** + +- Identifies remote and project from configuration +- Synchronizes all DRS records for a given project from the server to the local repository + +### `git drs push [remote-name]` + +Push local DRS objects to server. Uploads new files and registers metadata. + +**Usage:** + +```bash +# Push to default remote +git drs push + +# Push to specific remote +git drs push staging +git drs push production +``` + +**What it does:** + +- Checks local repository for DRS metadata +- For each object, uploads file to bucket if file exists locally +- If file doesn't exist locally (metadata only), registers metadata without upload +- This enables cross-remote promotion workflows + +**Cross-Remote Promotion:** + +Transfer DRS records from one remote to another (eg staging to production) without re-uploading files: + +```bash +# Fetch metadata from staging +git drs fetch staging + +# Push metadata to production (no file upload since files don't exist locally) +git drs push production +``` + +This is useful when files are already in the production bucket with matching SHA256 hashes. It can also be used to reupload files given that the files are pulled to the repo first. + +**Note:** `fetch` and `push` are commonly used together. `fetch` pulls metadata from one remote, `push` registers it to another. + +### `git drs add-url` + +Add a file reference via S3 URL without copying the data. + +**Usage:** + +```bash +# Use default remote +git drs add-url s3://bucket/path/file --sha256 + +# Use specific remote +git drs add-url s3://bucket/path/file --sha256 --remote staging +``` + +**With AWS Credentials:** + +```bash +git drs add-url s3://bucket/path/file \ + --sha256 \ + --aws-access-key \ + --aws-secret-key +``` + +**Options:** + +- `--sha256 `: Required SHA256 hash of the file +- `--remote `: Target remote (default: default_remote) +- `--aws-access-key `: AWS access key +- `--aws-secret-key `: AWS secret key +- `--endpoint `: Custom S3 endpoint +- `--region `: AWS region + +### `git drs create-cache` + +Create a cache from a manifest file (Terra/AnVIL). + +```bash +git drs create-cache manifest.tsv +``` + +### `git drs version` + +Display Git DRS version information. + +```bash +git drs version +``` + +### Internal Commands + +These commands are called automatically by Git hooks: + +- `git drs precommit`: Process staged files during commit +- `git drs transfer`: Handle file transfers during push/pull +- `git drs transferref`: Handle reference transfers (AnVIL/Terra) + +## Git LFS Commands + +### `git lfs track` + +Manage file tracking patterns. + +**View Tracked Patterns:** + +```bash +git lfs track +``` + +**Track New Pattern:** + +```bash +git lfs track "*.bam" +git lfs track "data/**" +git lfs track "specific-file.txt" +``` + +**Untrack Pattern:** + +```bash +git lfs untrack "*.bam" +``` + +### `git lfs ls-files` + +List LFS-tracked files in the repository. + +**All Files:** + +```bash +git lfs ls-files +``` + +**Specific Pattern:** + +```bash +git lfs ls-files -I "*.bam" +git lfs ls-files -I "data/**" +``` + +**Output Format:** + +- `*` prefix: File is localized (downloaded) +- `-` prefix: File is not localized +- No prefix: File status unknown + +### `git lfs pull` + +Download LFS-tracked files. 
+ +**All Files:** + +```bash +git lfs pull +``` + +**Specific Files:** + +```bash +git lfs pull -I "*.bam" +git lfs pull -I "data/important.txt" +git lfs pull -I "results/**" +``` + +**Multiple Patterns:** + +```bash +git lfs pull -I "*.bam" -I "*.vcf" +``` + +### `git lfs install` + +Configure Git LFS for the system or repository. + +**System-wide:** + +```bash +git lfs install --skip-smudge +``` + +**Repository-only:** + +```bash +git lfs install --local --skip-smudge +``` + +The `--skip-smudge` option prevents automatic downloading of all LFS files during clone/checkout. + +## Standard Git Commands + +Git DRS integrates with standard Git commands: + +### `git add` + +Stage files for commit. LFS-tracked files are automatically processed. + +```bash +git add myfile.bam +git add data/ +git add . +``` + +### `git commit` + +Commit changes. Git DRS pre-commit hook runs automatically. + +```bash +git commit -m "Add new data files" +``` + +### `git push` + +Push commits to remote. Git DRS automatically uploads new files to DRS server. + +```bash +git push +git push origin main +``` + +### `git clone` + +Clone repository. Use with Git DRS initialization: + +```bash +git clone +cd +git drs init +git drs remote add gen3 production --cred /path/to/credentials.json --url ... --project ... --bucket ... +``` + +## Workflow Examples + +### Complete File Addition Workflow + +```bash +# 1. Ensure file type is tracked +git lfs track "*.bam" +git add .gitattributes + +# 2. Add your file +git add mydata.bam + +# 3. Verify tracking +git lfs ls-files -I "mydata.bam" + +# 4. Commit (creates DRS record) +git commit -m "Add analysis results" + +# 5. Push (uploads to default DRS server) +git push +``` + +### Selective File Download + +```bash +# Check what's available +git lfs ls-files + +# Download specific files +git lfs pull -I "results/*.txt" +git lfs pull -I "important-dataset.bam" + +# Verify download +git lfs ls-files -I "results/*.txt" +``` + +### Repository Setup from Scratch + +```bash +# 1. Create and clone repo +git clone +cd + +# 2. Initialize Git DRS +git drs init + +# 3. Add DRS remote +git drs remote add gen3 production \ + --url https://calypr-public.ohsu.edu \ + --cred /path/to/credentials.json \ + --project my-project \ + --bucket my-bucket + +# 4. Set up file tracking +git lfs track "*.bam" +git lfs track "*.vcf.gz" +git lfs track "data/**" +git add .gitattributes +git commit -m "Configure LFS tracking" +git push + +# 5. Add data files +git add data/sample1.bam +git commit -m "Add sample data" +git push +``` + +### Cross-Remote Promotion Workflow + +```bash +# 1. Add multiple remotes +git drs remote add gen3 staging \ + --url https://staging.calypr.ohsu.edu \ + --cred /path/to/staging-credentials.json \ + --project staging-project \ + --bucket staging-bucket + +git drs remote add gen3 production \ + --url https://calypr-public.ohsu.edu \ + --cred /path/to/prod-credentials.json \ + --project prod-project \ + --bucket prod-bucket + +# 2. Fetch metadata from staging +git drs fetch staging + +# 3. 
Push metadata to production (no re-upload) +git drs push production +``` + +## Environment Variables + +Git DRS respects these environment variables: + +- `AWS_ACCESS_KEY_ID`: AWS access key (for S3 operations) +- `AWS_SECRET_ACCESS_KEY`: AWS secret key (for S3 operations) +- `GOOGLE_PROJECT`: Google Cloud project ID (for AnVIL) +- `WORKSPACE_BUCKET`: Terra workspace bucket (for AnVIL) + +## Help and Documentation + +Use `--help` with any command for detailed usage: + +```bash +git-drs --help +git-drs init --help +git-drs add-url --help +git lfs --help +git lfs track --help +``` diff --git a/docs/tools/git-drs/developer-guide.md b/docs/tools/git-drs/developer-guide.md new file mode 100644 index 0000000..1c6ad6d --- /dev/null +++ b/docs/tools/git-drs/developer-guide.md @@ -0,0 +1,185 @@ +# Developer Guide + +This guide covers Git DRS internals, architecture, and development information. + +## Architecture Overview + +Git DRS integrates with Git through several mechanisms: + +### Git Hooks Integration + +**Pre-commit Hook**: `git drs precommit` +- Triggered automatically before each commit +- Processes all staged LFS files +- Creates DRS records for new files +- Only processes files that don't already exist on the DRS server +- Prepares metadata for later upload during push + +**Custom Transfer Protocol** +- Git LFS uses custom transfers to communicate with Git DRS +- Handles both upload (push) and download (pull) operations +- Transfers run automatically during `git push` and `git lfs pull` +- Information passed through JSON protocol between Git LFS and Git DRS + +### File Processing Flow + +``` +1. Developer: git add file.bam +2. Developer: git commit -m "Add data" +3. Git Hook: git drs precommit + - Creates DRS object metadata + - Stores metadata in repository local state +4. Developer: git push +5. Git LFS: Initiates custom transfer +6. Git DRS: + - Registers file with DRS server (indexd record) + - Uploads file to configured bucket + - Updates transfer logs +``` + +## Custom Transfer Protocol + +Git DRS implements the [Git LFS Custom Transfer Protocol](https://github.com/git-lfs/git-lfs/blob/main/docs/custom-transfers.md). + +### Transfer Types + +**Upload Transfer (gen3)**: +- Creates indexd record on DRS server +- Uploads file to Gen3-registered S3 bucket +- Updates DRS object with access URLs + +**Download Transfer (gen3)**: +- Retrieves file metadata from DRS server +- Downloads file from configured storage +- Validates checksums + +**Reference Transfer**: +- Handles S3 URL references without data movement +- Links existing S3 objects to DRS records + +### Protocol Communication + +Git LFS and Git DRS communicate via JSON messages: + +```json +{ + "event": "init", + "operation": "upload", + "remote": "origin", + "concurrent": 3, + "concurrenttransfers": 3 +} +``` + +Response handling and logging occurs in transfer clients to avoid interfering with Git LFS stdout expectations. + +## Repository Structure + +### Core Components + +``` +cmd/ # CLI command implementations +├── initialize/ # Repository initialization +├── transfer/ # Custom transfer handlers +├── precommit/ # Pre-commit hook +├── addurl/ # S3 URL reference handling +└── ... 
+ +client/ # DRS client implementations +├── interface.go # Client interface definitions +├── indexd.go # Gen3/indexd client +├── anvil.go # AnVIL client +└── drs-map.go # File mapping utilities + +config/ # Configuration management +└── config.go # Config file handling + +drs/ # DRS object utilities +├── object.go # DRS object structures +└── util.go # Utility functions + +lfs/ # Git LFS integration +└── messages.go # LFS protocol messages + +utils/ # Shared utilities +├── common.go # Common functions +├── lfs-track.go # LFS tracking utilities +└── util.go # General utilities +``` + +### Configuration System + +**Repository Configuration**: Git local configuration (.git/config) +```yaml +current_server: gen3 +servers: + gen3: + endpoint: "https://data.example.org/" + profile: "myprofile" + project: "project-123" + bucket: "data-bucket" +``` + +### DRS Object Management + +Objects are managed as local Git state during pre-commit and referenced during transfers. + +## Development Setup + +### Prerequisites + +- Go 1.24+ +- Git LFS installed +- Access to a DRS server for testing + +### Building from Source + +```bash +# Clone repository +git clone https://github.com/calypr/git-drs.git +cd git-drs + +# Install dependencies +go mod download + +# Build +go build + +# Install locally +export PATH=$PATH:$(pwd) +``` + +### Development Workflow + +1. **Make changes** to source code +2. **Build and test**: + ```bash + go build + go test ./... + ``` +3. **Test with real repository**: + ```bash + cd /path/to/test-repo + /path/to/git-drs/git-drs --help + ``` + +## Debugging and Logging + +### Log Locations + +- **Commit logs**: Repository system logs +- **Transfer logs**: Repository system logs + + +## Testing + +### Unit Tests + +```bash +# Test specific functionality +go test ./utils -run TestLFSTrack +``` + +### Integration Tests + +**WIP** \ No newline at end of file diff --git a/docs/tools/git-drs/getting-started.md b/docs/tools/git-drs/getting-started.md new file mode 100644 index 0000000..9b43e1e --- /dev/null +++ b/docs/tools/git-drs/getting-started.md @@ -0,0 +1,321 @@ +# Getting Started + +This guide walks you through setting up Git DRS and performing common workflows. + +> **Navigation:** [Installation](installation.md) → **Getting Started** → [Commands Reference](commands.md) → [Troubleshooting](troubleshooting.md) + +## Repository Initialization + +Every Git repository using Git DRS requires configuration, whether you're creating a new repo or cloning an existing one. + +### Cloning Existing Repository (Gen3) + +1. **Clone the Repository** + + ```bash + git clone .git + cd + ``` + +2. **Configure SSH** (if using SSH URLs) + + If using SSH URLs like `git@github.com:user/repo.git`, add to `~/.ssh/config`: + + ``` + Host github.com + TCPKeepAlive yes + ServerAliveInterval 30 + ``` + +3. **Get Credentials** + + - Log in to your data commons (e.g., https://calypr-public.ohsu.edu/) + - Profile → Create API Key → Download JSON + - **Note**: Credentials expire after 30 days + +4. **Initialize Repository** + + ```bash + git drs init + ``` + +5. **Verify Configuration** + + ```bash + git drs remote list + ``` + + Output: + ``` + * production gen3 https://calypr-public.ohsu.edu/ + ``` + + The `*` indicates this is the default remote. + +### New Repository Setup (Gen3) + +1. **Create and Clone Repository** + + ```bash + git clone .git + cd + ``` + +2. **Configure SSH** (if needed - same as above) + +3. **Get Credentials** (same as above) + +4. 
**Get Project Details** + + Contact your data coordinator for: + - DRS server URL + - Project ID + - Bucket name + +5. **Initialize Git DRS** + + ```bash + git drs init + ``` + +6. **Add Remote Configuration** + + ```bash + git drs remote add gen3 production \ + --cred /path/to/credentials.json \ + --url https://calypr-public.ohsu.edu \ + --project my-project \ + --bucket my-bucket + ``` + + **Note:** Since this is your first remote, it automatically becomes the default. No need to run `git drs remote set`. + +7. **Verify Configuration** + + ```bash + git drs remote list + ``` + + Output: + ``` + * production gen3 https://calypr-public.ohsu.edu + ``` + +**Managing Additional Remotes** + +You can add more remotes later for multi-environment workflows (development, staging, production): + +```bash +# Add staging remote +git drs remote add gen3 staging \ + --cred /path/to/staging-credentials.json \ + --url https://staging.calypr.ohsu.edu \ + --project staging-project \ + --bucket staging-bucket + +# View all remotes +git drs remote list + +# Switch default remote +git drs remote set staging + +# Or use specific remote for one command +git drs push production +git drs fetch staging +``` + +## File Tracking + +Git DRS uses Git LFS to track files. You must explicitly track file patterns before adding them. + +### View Current Tracking + +```bash +git lfs track +``` + +### Track Files + +**Single File** + +```bash +git lfs track path/to/specific-file.txt +git add .gitattributes +``` + +**File Pattern** + +```bash +git lfs track "*.bam" +git add .gitattributes +``` + +**Directory** + +```bash +git lfs track "data/**" +git add .gitattributes +``` + +### Untrack Files + +```bash +# View tracked patterns +git lfs track + +# Remove pattern +git lfs untrack "*.bam" + +# Stage changes +git add .gitattributes +``` + +## Basic Workflows + +### Adding and Pushing Files + +```bash +# Track file type (if not already tracked) +git lfs track "*.bam" +git add .gitattributes + +# Add your file +git add myfile.bam + +# Verify LFS is tracking it +git lfs ls-files + +# Commit and push +git commit -m "Add new data file" +git push +``` + +> **Note**: Git DRS automatically creates DRS records during commit and uploads files to the default remote during push. + +### Downloading Files + +**Single File** + +```bash +git lfs pull -I path/to/file.bam +``` + +**Pattern** + +```bash +git lfs pull -I "*.bam" +``` + +**All Files** + +```bash +git lfs pull +``` + +**Directory** + +```bash +git lfs pull -I "data/**" +``` + +### Checking File Status + +```bash +# List all LFS-tracked files +git lfs ls-files + +# Check specific pattern +git lfs ls-files -I "*.bam" + +# View localization status +# (-) = not localized, (*) = localized +git lfs ls-files +``` + +## Working with S3 Files + +You can add references to existing S3 files without copying them: + +```bash +# Track the file pattern first +git lfs track "myfile.txt" +git add .gitattributes + +# Add S3 reference +git drs add-url s3://bucket/path/to/file \ + --sha256 \ + --aws-access-key \ + --aws-secret-key + +# Commit and push +git commit -m "Add S3 file reference" +git push +``` + +See [S3 Integration Guide](adding-s3-files.md) for detailed examples. 
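+
+The `--sha256` value is something you compute yourself before calling `add-url`. A minimal sketch, assuming you have read access to the object and using `my-bucket`/`my-file` as stand-ins for your own paths (the AWS CLI streaming step is optional if you already have a local copy):
+
+```bash
+S3_URL=s3://my-bucket/path/to/my-file
+
+# Hash a local copy of the file...
+shasum -a 256 my-file
+
+# ...or stream the object from S3 and hash it without keeping a copy
+HASH=$(aws s3 cp "$S3_URL" - | shasum -a 256 | cut -d' ' -f1)
+
+# Register the reference with the computed hash
+git drs add-url "$S3_URL" --sha256 "$HASH"
+```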
+ +## Configuration Management + +### View Configuration + +```bash +git drs remote list +``` + +### Update Configuration + +```bash +# Refresh credentials - re-add remote with new credentials +git drs remote add gen3 production \ + --cred /path/to/new-credentials.json \ + --url https://calypr-public.ohsu.edu \ + --project my-project \ + --bucket my-bucket + +# Switch default remote +git drs remote set staging +``` + +# +## Command Summary + +| Action | Commands | +| ------------------ | ------------------------------------------- | +| **Initialize** | `git drs init` | +| **Add remote** | `git drs remote add gen3 --cred...` | +| **View remotes** | `git drs remote list` | +| **Set default** | `git drs remote set ` | +| **Track files** | `git lfs track "pattern"` | +| **Check tracked** | `git lfs ls-files` | +| **Add files** | `git add file.ext` | +| **Commit** | `git commit -m "message"` | +| **Push** | `git push` | +| **Download** | `git lfs pull -I "pattern"` | + +## Session Workflow + +For each work session: + +1. **Refresh credentials** (if expired - credentials expire after 30 days) + + ```bash + git drs remote add gen3 production \ + --cred /path/to/new-credentials.json \ + --url https://calypr-public.ohsu.edu \ + --project my-project \ + --bucket my-bucket + ``` + +2. **Work with files** (track, add, commit, push) + +3. **Download files as needed** + + ```bash + git lfs pull -I "required-files*" + ``` + +## Next Steps + +- [Commands Reference](commands.md) - Complete command documentation +- [Troubleshooting](troubleshooting.md) - Common issues and solutions +- [Developer Guide](developer-guide.md) - Advanced usage and internals diff --git a/docs/tools/git-drs/index.md b/docs/tools/git-drs/index.md new file mode 100644 index 0000000..91f0c8d --- /dev/null +++ b/docs/tools/git-drs/index.md @@ -0,0 +1,69 @@ +--- +title: Git-DRS +--- + +# Git-DRS + +Git-DRS is a Git extension for managing large files in a Gen3 Data Commons using the **Data Repository Service (DRS)** content-addressable storage model. It essentially serves as a Git-LFS (Large File Storage) replacement tailored for Gen3. + +It allows you to: +- Track large files in your git repository without bloating it. +- Store the actual file contents in a Gen3 data commons (indexed via DRS). +- Seamlessly synchronize files between your local environment and the commons. + +## Installation + +Ensure `git-drs` is installed and in your PATH. + +## Initialization + +Initialize a repository to use Git-DRS. This sets up the necessary hooks and configuration. + +```bash +git-drs init +``` + +## Basic Workflow + +1. **Add a file**: Track a large file with Git-DRS. + ```bash + git-drs add + ``` + This replaces the large file with a small pointer file in your working directory. + +2. **Push**: Upload the tracked files to the Gen3 Commons. + ```bash + git-drs push + ``` + +3. **Fetch**: Download file contents (resolving pointer files) from the Commons. + ```bash + git-drs fetch + ``` + +## Command Reference + +### `init` +Initializes `git-drs` in the current git repository. Recommended to run at the root of the repo. + +### `add ` +Tracks a file using Git-DRS. The file content is moved to a local cache, and replaced with a pointer file containing its hash and size. + +### `push` +Uploads the contents of tracked files to the configured Gen3 Commons. This usually happens automatically during `git push` if hooks are configured, but can be run manually. 
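+
+As a small illustration of the two paths described above (file names are placeholders):
+
+```bash
+# Typical flow: the hooks upload tracked content as part of a normal push
+git add my-large-file
+git commit -m "Add large file"
+git push
+
+# Manual upload, e.g. if hooks are not configured or a previous push was interrupted
+git-drs push
+```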
+
+### `fetch`
+Downloads the contents of tracked files from the Gen3 Commons, replacing the local pointer files with the actual data.
+
+### `add-url <url>`
+Associates a specific URL with a GUID in the local cache or for registration purposes.
+
+### `list`
+Lists the files currently tracked by Git-DRS in the project.
+
+### `remote`
+Manage remote DRS server configurations.
+
+```bash
+git-drs remote add <server-type> <name> [flags]
+```
diff --git a/docs/tools/git-drs/installation.md b/docs/tools/git-drs/installation.md
new file mode 100644
index 0000000..bc1c85a
--- /dev/null
+++ b/docs/tools/git-drs/installation.md
@@ -0,0 +1,213 @@
+# Installation Guide
+
+This guide covers installation of Git DRS across different environments and target DRS servers.
+
+## Prerequisites
+
+All installations require [Git LFS](https://git-lfs.com/) to be installed first:
+
+```bash
+# macOS
+brew install git-lfs
+
+# Linux (download binary)
+wget https://github.com/git-lfs/git-lfs/releases/download/v3.7.0/git-lfs-linux-amd64-v3.7.0.tar.gz
+tar -xvf git-lfs-linux-amd64-v3.7.0.tar.gz
+export PREFIX=$HOME
+./git-lfs-3.7.0/install.sh
+
+# Configure LFS
+git lfs install --skip-smudge
+```
+
+## Local Installation (Gen3 Server)
+
+**Target Environment**: Local development machine targeting Gen3 data commons (e.g., CALYPR)
+
+### Steps
+
+1. **Install Git DRS**
+   ```bash
+   /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/calypr/git-drs/refs/heads/main/install.sh)"
+   ```
+
+2. **Update PATH**
+   ```bash
+   # Add to ~/.bash_profile or ~/.zshrc
+   export PATH="$PATH:$HOME/.local/bin"
+   source ~/.bash_profile  # or source ~/.zshrc
+   ```
+
+3. **Verify Installation**
+   ```bash
+   git-drs --help
+   ```
+
+4. **Get Credentials**
+   - Log in to your data commons (e.g., https://calypr-public.ohsu.edu/)
+   - Click your email → Profile → Create API Key → Download JSON
+   - Note the download path for later configuration
+
+## HPC Installation (Gen3 Server)
+
+**Target Environment**: High-performance computing systems targeting Gen3 servers
+
+### Steps
+
+1. **Install Git LFS on HPC**
+   ```bash
+   # Download and install Git LFS
+   wget https://github.com/git-lfs/git-lfs/releases/download/v3.7.1/git-lfs-linux-amd64-v3.7.1.tar.gz
+   tar -xvf git-lfs-linux-amd64-v3.7.1.tar.gz
+   export PREFIX=$HOME
+   ./git-lfs-3.7.1/install.sh
+
+   # Make permanent
+   echo 'export PATH="$HOME/bin:$PATH"' >> ~/.bash_profile
+   source ~/.bash_profile
+
+   # Configure
+   git lfs install --skip-smudge
+
+   # Cleanup
+   rm git-lfs-linux-amd64-v3.7.1.tar.gz
+   rm -r git-lfs-3.7.1/
+   ```
+
+2. **Configure Git/SSH (if needed)**
+   ```bash
+   # Generate SSH key
+   ssh-keygen -t ed25519 -C "your_email@example.com"
+
+   # Add to ssh-agent
+   eval "$(ssh-agent -s)"
+   ssh-add ~/.ssh/id_ed25519
+
+   # Add public key to GitHub/GitLab
+   cat ~/.ssh/id_ed25519.pub
+   ```
+
+3. **Install Git DRS**
+   ```bash
+   /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/calypr/git-drs/refs/heads/fix/install-error-macos/install.sh)"
+
+   # Update PATH
+   echo 'export PATH="$PATH:$HOME/.local/bin"' >> ~/.bash_profile
+   source ~/.bash_profile
+   ```
+
+4. **Verify Installation**
+   ```bash
+   git-drs version
+   ```
+
+## Terra/Jupyter Installation (AnVIL Server)
+
+**Target Environment**: Terra Jupyter notebooks targeting AnVIL DRS servers
+
+### Steps
+
+1. **Launch Jupyter Environment** in Terra
+
+2. **Open Terminal** in Jupyter
+
+3. 
**Install Dependencies** + ```bash + # Install Git DRS + /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/calypr/git-drs/refs/heads/fix/install-error-macos/install.sh)" + + # Install DRS Downloader + wget https://github.com/anvilproject/drs_downloader/releases/download/0.1.6-rc.4/drs_downloader + chmod 755 drs_downloader + ``` + +4. **Verify Installation** + ```bash + git-drs --help + drs_downloader --help + ``` + +5. **Example Workflow** + ```bash + # Clone example repository + git clone https://github.com/quinnwai/super-cool-anvil-analysis.git + cd super-cool-anvil-analysis/ + + # Initialize and configure for your Terra project + git drs init + git drs remote add anvil development --terraProject $GOOGLE_PROJECT + + # Work with manifests + gsutil cp $WORKSPACE_BUCKET/anvil-manifest.tsv . + git drs create-cache anvil-manifest.tsv + + # List and pull files + git lfs ls-files + git lfs pull -I data_tables_sequencing_dataset.tsv + ``` + +## Local Installation (AnVIL Server) + +**Target Environment**: Local development machine targeting AnVIL servers + +### Steps + +1. **Install Git DRS** (same as Gen3 local installation) + ```bash + /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/calypr/git-drs/refs/heads/fix/install-error-macos/install.sh)" + ``` + +2. **Get Terra Project ID** + - Log in to [AnVIL Workspaces](https://anvil.terra.bio/#workspaces) + - Select your workspace + - Copy the Google Project ID from "CLOUD INFORMATION" + +3. **Configure AnVIL Access** + ```bash + # Check existing configuration + git drs remote list + + # If no AnVIL server configured, add it + git drs init + git drs remote add anvil development --terraProject + ``` + +## Build from Source + +For development or custom builds: + +```bash +# Clone repository +git clone https://github.com/calypr/git-drs.git +cd git-drs + +# Build +go build + +# Make accessible +export PATH=$PATH:$(pwd) +``` + +## Post-Installation + +After installation, verify your setup: + +```bash +# Check Git DRS version +git-drs version + +# Check Git LFS +git lfs version + +# View configured remotes (after setup) +git drs remote list +``` + +## Next Steps + +After installation, see: + +> **Navigation:** [Installation](installation.md) → [Getting Started](getting-started.md) → [Commands Reference](commands.md) + +- **[Getting Started](getting-started.md)** - Repository setup and basic workflows +- **[Commands Reference](commands.md)** - Complete command documentation diff --git a/docs/tools/git-drs/remove-files.md b/docs/tools/git-drs/remove-files.md new file mode 100644 index 0000000..ddb03ec --- /dev/null +++ b/docs/tools/git-drs/remove-files.md @@ -0,0 +1,84 @@ +--- +title: Removing Files +--- + +## 🗑️ Deleting Files and Updating Metadata + +When removing data files from your project, it's crucial to also update the manifest and associated metadata to maintain consistency. + +### 1. Remove File(s) Using `git rm` + +Use the `git rm` command to delete files and automatically update the manifest and metadata: + +```bash +git rm DATA/subject-123/vcf/sample1.vcf.gz +``` + +This command performs the following actions: + +- Removes the corresponding `entry` from `MANIFEST/`. + +!!! note + It will not: + + - Delete the specified `data` file. + - Update or remove related metadata in the `META/` directory. + +### 2. Review Changes + +After removing files, check the status of your project to see the staged changes: + +```bash +git status +``` + +This will display the files marked for deletion and any updates to the manifest. + +### 3. 
Update Metadata + +If you need to regenerate the metadata after file deletions, use the `forge meta init` command: + +```bash +forge meta init +``` + +!!! note + This command rebuilds the `META/` directory based on the current state of the repository, ensuring that your metadata accurately reflects the existing data files. + + If you have customized the metadata, you will need to manually remove the affected DocumentReference entries before running this command to avoid conflicts or inconsistencies. + +### 4. Commit Changes + +Once you've reviewed the changes, commit them to your local repository: + +```bash +git commit -m "Removed sample1.vcf.gz and updated associated metadata" +``` + +--- + +## 🚀 Pushing Updates to the Platform + +After committing your changes, push them to the CALYPR platform. + +### 1. Push Changes + +Use the `git push` command (which triggers the `git-drs` transfer hooks) to upload your changes: + +```bash +git push +``` + +If you need to perform metadata registration specifically, you can use `git drs push`. + +--- + +## 📌 Best Practices + +- Always use `git rm` to delete files to ensure that the Git state is properly updated. +- Use `forge meta init` to regenerate metadata when necessary, especially after significant changes to your data files. +- Regularly review your remote repository after pushing changes to confirm successful updates. + +--- + +By following these steps, you can maintain a consistent and accurate state across your data, manifest, and metadata in your CALYPR project. diff --git a/docs/tools/git-drs/troubleshooting.md b/docs/tools/git-drs/troubleshooting.md new file mode 100644 index 0000000..556338e --- /dev/null +++ b/docs/tools/git-drs/troubleshooting.md @@ -0,0 +1,417 @@ +# Troubleshooting + +Common issues and solutions when working with Git DRS. 
+ +> **Navigation:** [Getting Started](getting-started.md) → [Commands Reference](commands.md) → **Troubleshooting** + +## When to Use Which Tool + +Understanding when to use Git, Git LFS, or Git DRS commands: + +### Git DRS Commands + +**Use for**: Repository and remote configuration + +- `git drs init` - Initialize Git LFS hooks +- `git drs remote add` - Configure DRS server connections +- `git drs remote list` - View configured remotes +- `git drs add-url` - Add S3 file references + +**When**: + +- Setting up a new repository +- Adding/managing DRS remotes +- Refreshing expired credentials +- Adding external file references + +### Git LFS Commands + +**Use for**: File tracking and management + +- `git lfs track` - Define which files to track +- `git lfs ls-files` - See tracked files and status +- `git lfs pull` - Download specific files +- `git lfs untrack` - Stop tracking file patterns + +**When**: + +- Managing which files are stored externally +- Downloading specific files +- Checking file localization status + +### Standard Git Commands + +**Use for**: Version control operations + +- `git add` - Stage files for commit +- `git commit` - Create commits +- `git push` - Upload commits and trigger file uploads +- `git pull` - Get latest commits + +**When**: + +- Normal development workflow +- Git DRS runs automatically in the background + +## Common Error Messages + +### Authentication Errors + +**Error**: `Upload error: 403 Forbidden` or `401 Unauthorized` + +**Cause**: Expired or invalid credentials + +**Solution**: + +```bash +# Download new credentials from your data commons +# Then refresh them by re-adding the remote +git drs remote add gen3 production \ + --cred /path/to/new-credentials.json \ + --url https://calypr-public.ohsu.edu \ + --project my-project \ + --bucket my-bucket +``` + +**Prevention**: + +- Credentials expire after 30 days +- Set a reminder to refresh them regularly + +--- + +**Error**: `Upload error: 503 Service Unavailable` + +**Cause**: DRS server is temporarily unavailable or credentials expired + +**Solutions**: + +1. Wait and retry the operation +2. Refresh credentials: + ```bash + git drs remote add gen3 production \ + --cred /path/to/credentials.json \ + --url https://calypr-public.ohsu.edu \ + --project my-project \ + --bucket my-bucket + ``` +3. 
If persistent, download new credentials from the data commons + +### Network Errors + +**Error**: `net/http: TLS handshake timeout` + +**Cause**: Network connectivity issues + +**Solution**: + +- Simply retry the command +- These are usually temporary network issues + +--- + +**Error**: Git push timeout during large file uploads + +**Cause**: Long-running operations timing out + +**Solution**: Add to `~/.ssh/config`: + +``` +Host github.com + TCPKeepAlive yes + ServerAliveInterval 30 +``` + +### File Tracking Issues + +**Error**: Files not being tracked by LFS + +**Symptoms**: + +- Large files committed directly to Git +- `git lfs ls-files` doesn't show your files + +**Solution**: + +```bash +# Check what's currently tracked +git lfs track + +# Track your file type +git lfs track "*.bam" +git add .gitattributes + +# Remove from Git and re-add +git rm --cached large-file.bam +git add large-file.bam +git commit -m "Track large file with LFS" +``` + +--- + +**Error**: `[404] Object does not exist on the server` + +**Symptoms**: + +- After clone, git pull fails + +**Solution**: + +```bash +# confirm repo has complete configuration +git drs remote list + +# init your git drs project +git drs init --cred /path/to/cred/file --profile + +# attempt git pull again +git lfs pull -I path/to/file +``` + +--- + +**Error**: `git lfs ls-files` shows files but they won't download + +**Cause**: Files may not have been properly uploaded or DRS records missing + +**Solution**: + +```bash +# Check repository status +git drs remote list + +# Try pulling with verbose output +git lfs pull -I "problematic-file*" --verbose + +# Check logs +cat .git/drs/*.log +``` + +### Configuration Issues + +**Error**: `git drs remote list` shows empty or incomplete configuration + +**Cause**: Repository not properly initialized or no remotes configured + +**Solution**: + +```bash +# Initialize repository if needed +git drs init + +# Add Gen3 remote +git drs remote add gen3 production \ + --cred /path/to/credentials.json \ + --url https://calypr-public.ohsu.edu \ + --project my-project \ + --bucket my-bucket + +# For AnVIL +git drs remote add anvil development --terraProject + +# Verify configuration +git drs remote list +``` + +--- + +**Error**: Configuration exists but commands fail + +**Cause**: Mismatched configuration between global and local settings, or expired credentials + +**Solution**: + +```bash +# Check configuration +git drs remote list + +# Refresh credentials by re-adding the remote +git drs remote add gen3 production \ + --cred /path/to/new-credentials.json \ + --url https://calypr-public.ohsu.edu \ + --project my-project \ + --bucket my-bucket +``` + +### Remote Configuration Issues + +**Error**: `no default remote configured` + +**Cause**: Repository initialized but no remotes added yet + +**Solution**: + +```bash +# Add your first remote (automatically becomes default) +git drs remote add gen3 production \ + --cred /path/to/credentials.json \ + --url https://calypr-public.ohsu.edu \ + --project my-project \ + --bucket my-bucket +``` + +--- + +**Error**: `default remote 'X' not found` + +**Cause**: Default remote was deleted or configuration is corrupted + +**Solution**: + +```bash +# List available remotes +git drs remote list + +# Set a different remote as default +git drs remote set staging + +# Or add a new remote +git drs remote add gen3 production \ + --cred /path/to/credentials.json \ + --url https://calypr-public.ohsu.edu \ + --project my-project \ + --bucket my-bucket +``` + +--- + +**Error**: Commands 
using wrong remote + +**Cause**: Default remote is not the one you want to use + +**Solution**: + +```bash +# Check current default +git drs remote list + +# Option 1: Change default remote +git drs remote set production + +# Option 2: Specify remote for single command +git drs push staging +git drs fetch production +``` + +## Undoing Changes + +### Untrack LFS Files + +If you accidentally tracked the wrong files: + +```bash +# See current tracking +git lfs track + +# Remove incorrect pattern +git lfs untrack "wrong-dir/**" + +# Add correct pattern +git lfs track "correct-dir/**" + +# Stage the changes +git add .gitattributes +git commit -m "Fix LFS tracking patterns" +``` + +### Undo Git Add + +Remove files from staging area: + +```bash +# Check what's staged +git status + +# Unstage specific files +git restore --staged file1.bam file2.bam + +# Unstage all files +git restore --staged . +``` + +### Undo Last Commit + +To retry a commit with different files: + +```bash +# Undo last commit, keep files in working directory +git reset --soft HEAD~1 + +# Or undo and unstage files +git reset HEAD~1 + +# Or completely undo commit and changes (BE CAREFUL!) +git reset --hard HEAD~1 +``` + +### Remove Files from LFS History + +If you committed large files directly to Git by mistake: + +```bash +# Remove from Git history (use carefully!) +git filter-branch --tree-filter 'rm -f large-file.dat' HEAD + +# Then track properly with LFS +git lfs track "*.dat" +git add .gitattributes +git add large-file.dat +git commit -m "Track large file with LFS" +``` + +## Diagnostic Commands + +### Check System Status + +```bash +# Git DRS version and help +git-drs version +git-drs --help + +# Configuration +git drs remote list + +# Repository status +git status +git lfs ls-files +``` + +### View Logs + +```bash +# Git DRS configuration +git drs remote list +``` + +### Test Connectivity + +```bash +# Test basic Git operations +git lfs pull --dry-run + +# Test DRS configuration +git drs remote list +``` + +## Getting Help + +### Log Analysis + +When reporting issues, include: + +```bash +# System information +git-drs version +git lfs version +git --version + +# Configuration +git drs remote list +``` + +## Prevention Best Practices + +1. **Test in small batches** - Don't commit hundreds of files at once +2. **Verify tracking** - Always check `git lfs ls-files` after adding files +3. **Use .gitignore** - Prevent accidental commits of temporary files +4. **Monitor repository size** - Keep an eye on `.git` directory size diff --git a/docs/tools/grip/clients.md b/docs/tools/grip/clients.md new file mode 100644 index 0000000..00abc55 --- /dev/null +++ b/docs/tools/grip/clients.md @@ -0,0 +1,141 @@ +--- +title: Client Library +menu: + main: + identifier: clients + weight: 25 +--- + + +# Getting Started + +GRIP has an API for making graph queries using structured data. Queries are defined using a series of step [operations](/docs/queries/operations). + +## Install the Python Client + +Available on [PyPI](https://pypi.org/project/gripql/). + +``` +pip install gripql +``` + +Or install the latest development version: + +``` +pip install "git+https://github.com/bmeg/grip.git#subdirectory=gripql/python" +``` + + +## Using the Python Client + +Let's go through the features currently supported in the python client. 
+
+First, import the client and create a connection to a GRIP server:
+
+```python
+import gripql
+G = gripql.Connection("https://bmeg.io").graph("bmeg")
+```
+
+Some GRIP servers may require authorization to access their API endpoints. The client can be configured to pass
+authorization headers in its requests.
+
+```python
+import gripql
+
+# Basic Auth Header - {'Authorization': 'Basic dGVzdDpwYXNzd29yZA=='}
+G = gripql.Connection("https://bmeg.io", user="test", password="password").graph("bmeg")
+
+# Bearer Token - {'Authorization': 'Bearer iamnotarealtoken'}
+G = gripql.Connection("https://bmeg.io", token="iamnotarealtoken").graph("bmeg")
+
+# OAuth2 / Custom - {"OauthEmail": "fake.user@gmail.com", "OauthAccessToken": "iamnotarealtoken", "OauthExpires": 1551985931}
+G = gripql.Connection("https://bmeg.io", credential_file="~/.grip_token.json").graph("bmeg")
+```
+
+Now that we have a connection to a graph instance, we can use it to make all of our queries.
+
+One of the first things you probably want to do is find some vertex out of all of the vertexes available in the system. In order to do this, we need to know something about the vertex we are looking for. To start, let's see if we can find a specific gene:
+
+```python
+result = G.V().hasLabel("Gene").has(gripql.eq("symbol", "TP53")).execute()
+print(result)
+```
+
+A couple of things about this first and simplest query. We start with `G`, our grip client instance connected to the "bmeg" graph, and create a new query with `V()`. This query is now being constructed. You can chain along as many operations as you want, and nothing actually gets sent to the server until you call `.execute()`.
+
+Once we make this query, we get a result:
+
+```python
+[
+  {
+    u'_id': u'ENSG00000141510',
+    u'_label': u'Gene',
+    u'end': 7687550,
+    u'description': u'tumor protein p53 [Source:HGNC Symbol%3BAcc:HGNC:11998]',
+    u'symbol': u'TP53',
+    u'start': 7661779,
+    u'seqId': u'17',
+    u'strand': u'-',
+    u'id': u'ENSG00000141510',
+    u'chromosome': u'17'
+  }
+]
+```
+
+This represents the vertex we queried for above. All vertexes in the system will have a similar structure, basically:
+
+* _\_id_: This represents the global identifier for this vertex. In order to draw edges between different vertexes from different data sets we need an identifier that can be constructed from available data. Often, the `_id` will be the field that you query on as a starting point for a traversal.
+* _\_label_: The label represents the type of the vertex. All vertexes with a given label will share many property keys and edge labels, and form a logical group within the system.
+
+The data on a query result can be accessed as properties on the result object; for example `result[0].data.symbol` would return:
+
+```python
+u'TP53'
+```
+
+You can also do a `has` query with a list of items using `gripql.within([...])` (other conditions exist, see the `Conditions` section below):
+
+```python
+result = G.V().hasLabel("Gene").has(gripql.within("symbol", ["TP53", "BRCA1"])).render({"_id": "_id", "symbol": "symbol"}).execute()
+print(result)
+```
+
+This returns both Gene vertexes:
+
+```
+[
+  {u'symbol': u'TP53', u'_id': u'ENSG00000141510'},
+  {u'symbol': u'BRCA1', u'_id': u'ENSG00000012048'}
+]
+```
+
+Once you are on a vertex, you can travel through that vertex's edges to find the vertexes it is connected to. Sometimes you don't even need to go all the way to the next vertex; the information on the edge between them may be sufficient.
+
+Edges in the graph are directional, so there are both incoming and outgoing edges from each vertex, leading to other vertexes in the graph. Edges also have a _label_, which distinguishes the kind of connections different vertexes can have with one another.
+
+Let's start with gene TP53 and see what kinds of other vertexes it is connected to.
+
+```python
+result = G.V().hasLabel("Gene").has(gripql.eq("symbol", "TP53")).in_("TranscriptFor").render({"id": "_id", "label": "_label"}).execute()
+print(result)
+```
+
+Here we have introduced a couple of new steps. The first is `.in_()`. This starts from wherever you are in the graph at the moment and follows all the incoming edges.
+Additionally, we have provided `TranscriptFor` as an argument to `.in_()`. This limits the returned vertices to only those connected to the `Gene` vertices by edges labeled `TranscriptFor`.
+
+
+```
+[
+  {u'_label': u'Transcript', u'_id': u'ENST00000413465'},
+  {u'_label': u'Transcript', u'_id': u'ENST00000604348'},
+  ...
+]
+```
+
+View a list of all available query operations [here](/docs/queries/operations).
+
+### Using the command line
+
+The query syntax accepted on the grip command line is defined in `gripql/javascript/gripql.js`.
diff --git a/docs/tools/grip/commands/create.md b/docs/tools/grip/commands/create.md
new file mode 100644
index 0000000..bde4aa8
--- /dev/null
+++ b/docs/tools/grip/commands/create.md
@@ -0,0 +1,27 @@
+
+---
+title: create
+
+menu:
+  main:
+    parent: commands
+    weight: 2
+---
+
+# `create`
+
+## Usage
+
+```bash
+gripql-cli create <graph-name> --host <host>
+```
+
+- `<graph-name>`: The name of the graph to be created (required).
+- `--host <host>`: The URL of the GripQL server (default is "localhost:8202").
+
+## Example
+
+```bash
+gripql-cli create my_new_graph --host myserver.com:8202
+```
+
diff --git a/docs/tools/grip/commands/delete.md b/docs/tools/grip/commands/delete.md
new file mode 100644
index 0000000..e4deeab
--- /dev/null
+++ b/docs/tools/grip/commands/delete.md
@@ -0,0 +1,42 @@
+---
+title: delete
+menu:
+  main:
+    parent: commands
+    weight: 3
+---
+
+# `delete` Command
+
+## Usage
+
+```bash
+gripql-cli delete <graph-name> --host <host> --file <json-file> --edges <edge-ids> --vertices <vertex-ids>
+```
+
+### Options
+
+- `<graph-name>`: Name of the graph (required)
+- `--host <host>`: GripQL server URL (default: "localhost:8202")
+- `--file <json-file>`: Path to a JSON file containing data to delete
+- `--edges <edge-ids>`: Comma-separated list of edge IDs to delete (ignored if a JSON file is provided)
+- `--vertices <vertex-ids>`: Comma-separated list of vertex IDs to delete (ignored if a JSON file is provided)
+
+## Example
+
+```bash
+gripql-cli delete my_graph --host myserver.com:8202 --edges edge1,edge2 --vertices vertex3,vertex4
+```
+
+## JSON File Format
+
+JSON file format for data to be deleted:
+
+```json
+{
+  "graph": "graph_name",
+  "edges": ["list of edge ids"],
+  "vertices": ["list of vertex ids"]
+}
+```
+
diff --git a/docs/tools/grip/commands/drop.md b/docs/tools/grip/commands/drop.md
new file mode 100644
index 0000000..c43aa0b
--- /dev/null
+++ b/docs/tools/grip/commands/drop.md
@@ -0,0 +1,14 @@
+---
+title: drop
+
+menu:
+  main:
+    parent: commands
+    weight: 4
+---
+
+```
+grip drop <graph>
+```
+
+Deletes a graph.
diff --git a/docs/tools/grip/commands/er.md b/docs/tools/grip/commands/er.md
new file mode 100644
index 0000000..28d34ee
--- /dev/null
+++ b/docs/tools/grip/commands/er.md
@@ -0,0 +1,49 @@
+---
+title: er
+
+menu:
+  main:
+    parent: commands
+    weight: 6
+---
+
+```
+grip er
+```
+
+The *External Resource* system allows GRIP to plug into existing data systems and
+integrate them into queryable graphs.
The `grip er` sub command acts as a client +to the external resource plugin proxies, issues command and displays the results. +This is often useful for debugging external resources before making them part of +an actual graph. + + +List collections provided by external resource +``` +grip er list +``` + +Get info about a collection +``` +grip er info +``` + +List ids from a collection +``` +grip er ids +``` + +List rows from a collection +``` +grip er rows +``` + +List rows with field match +``` +grip get +``` + +List rows with field match +``` +grip er query +``` diff --git a/docs/tools/grip/commands/list.md b/docs/tools/grip/commands/list.md new file mode 100644 index 0000000..f018b6a --- /dev/null +++ b/docs/tools/grip/commands/list.md @@ -0,0 +1,40 @@ +--- +title: list + +menu: + main: + parent: commands + weight: 3 +--- + +The `list tables` command is used to display all available tables in the grip server. Each table is represented by its source, name, fields, and link map. Here's a breakdown of how to use this command: + +- **Usage:** `gripql list tables` +- **Short Description:** List all available tables in the grip server. +- **Long Description:** This command connects to the grip server and retrieves information about all available tables. It then prints each table's source, name, fields, and link map to the console. +- **Arguments:** None +- **Flags:** + - `--host`: The URL of the grip server (default: "localhost:8202") + +## `gripql list graphs` Command Documentation + +The `list graphs` command is used to display all available graphs in the grip server. Here's a breakdown of how to use this command: + +- **Usage:** `gripql list graphs` +- **Short Description:** List all available graphs in the grip server. +- **Long Description:** This command connects to the grip server and retrieves information about all available graphs. It then prints each graph's name to the console. +- **Arguments:** None +- **Flags:** + - `--host`: The URL of the grip server (default: "localhost:8202") + +## `gripql list labels` Command Documentation + +The `list labels` command is used to display all available vertex and edge labels in a specific graph. Here's a breakdown of how to use this command: + +- **Usage:** `gripql list labels ` +- **Short Description:** List the vertex and edge labels in a specific graph. +- **Long Description:** This command takes one argument, the name of the graph, and connects to the grip server. It retrieves information about all available vertex and edge labels in that graph and prints them to the console in JSON format. +- **Arguments:** + - ``: The name of the graph to list labels for. +- **Flags:** + - `--host`: The URL of the grip server (default: "localhost:8202") \ No newline at end of file diff --git a/docs/tools/grip/commands/mongoload.md b/docs/tools/grip/commands/mongoload.md new file mode 100644 index 0000000..e9372d7 --- /dev/null +++ b/docs/tools/grip/commands/mongoload.md @@ -0,0 +1,12 @@ +--- +title: mongoload + +menu: + main: + parent: commands + weight: 4 +--- + +``` +grip mongoload +``` diff --git a/docs/tools/grip/commands/query.md b/docs/tools/grip/commands/query.md new file mode 100644 index 0000000..bb4f7ec --- /dev/null +++ b/docs/tools/grip/commands/query.md @@ -0,0 +1,20 @@ +--- +title: query + +menu: + main: + parent: commands + weight: 2 +--- + +``` +grip query +``` + +Run a query on a graph. 
+
+## Examples
+
+```bash
+grip query pc12 'V().hasLabel("Pathway").count()'
+```
+
diff --git a/docs/tools/grip/commands/server.md b/docs/tools/grip/commands/server.md
new file mode 100644
index 0000000..fb7e82a
--- /dev/null
+++ b/docs/tools/grip/commands/server.md
@@ -0,0 +1,53 @@
+---
+title: server
+menu:
+  main:
+    parent: commands
+    weight: 1
+---
+
+# `server`
+The `server` command starts up a graph server and waits for incoming requests.
+
+## Default Configuration
+If invoked with no arguments or config files, GRIP will start up in embedded mode, using a Badger-based graph driver.
+
+## Networking
+By default the GRIP server operates on two ports: `8201` serves the HTTP interface and `8202` serves the GRPC interface. The Python, R, and JavaScript clients are designed to connect to the HTTP interface on `8201`. The `grip` command will often use port `8202` in order to complete operations. For example, if you call `grip list graphs` it will contact port `8202` rather than using the HTTP port. This means that if you are working with a server that is behind a firewall, and only the HTTP port is available, then the grip command-line program will not be able to issue commands, even if the server is visible to client libraries.
+
+## CLI Usage
+The `server` command can take several flags for configuration:
+- `--config` or `-c` - Specifies a YAML config file with server settings. This overrides all other settings. Defaults to "" (empty string).
+- `--http-port` - Sets the port used by the HTTP interface. Defaults to "8201".
+- `--rpc-port` - Sets the port used by the GRPC interface. Defaults to "8202".
+- `--read-only` - Start the server in read-only mode. Defaults to false.
+- `--log-level` or `--log-format` - Set the logging level and format, respectively. Defaults are "info" for log level and "text" for format.
+- `--log-requests` - Log all requests. Defaults to false.
+- `--verbose` - Sets the log level to debug if true.
+- `--plugins` or `-p` - Specifies a directory with GRIPPER plugins to load. If not specified, no plugins will be loaded by default.
+- `--driver` or `-d` - Specifies the default driver for graph storage. Defaults to "badger". Other possible options are: "pebble", "mongo", "grids", and "sqlite".
+- `--endpoint` or `-w` - Load a web endpoint plugin. Use multiple times to load multiple plugins. The format is key=value where key is the plugin name and value is the configuration string for the plugin.
+- `--endpoint-config` or `-l` - Configure a loaded web endpoint plugin. Use multiple times to configure multiple plugins. The format is key=value where key is in the form 'pluginname:key' and value is the configuration value for that key.
+- `--er` or `-e` - Set GRIPPER source addresses. This flag can be used multiple times to specify multiple addresses. Defaults to an empty map.
+ +## Examples + +```bash +# Load server with a specific config file +grip server --config /path/to/your_config.yaml + +# Set the HTTP port to 9001 +grip server --http-port 9001 + +# Start in read-only mode +grip server --read-only + +# Enable verbose logging (sets log level to debug) +grip server --verbose + +# Load a web endpoint plugin named 'foo' with configuration string 'config=value' +grip server --endpoint foo=config=value + +# Configure the loaded 'foo' web endpoint plugin, setting its key 'key1' to value 'val1' +grip server --endpoint-config foo:key1=val1 +``` diff --git a/docs/tools/grip/databases.md b/docs/tools/grip/databases.md new file mode 100644 index 0000000..3ad6b01 --- /dev/null +++ b/docs/tools/grip/databases.md @@ -0,0 +1,120 @@ +--- +title: Database Configuration +menu: + main: + identifier: Databases + weight: 20 +--- + + +# Embedded Key Value Stores + +GRIP supports storing vertices and edges in a variety of key-value stores including: + + * [Pebble](https://github.com/cockroachdb/pebble) + * [Badger](https://github.com/dgraph-io/badger) + * [BoltDB](https://github.com/boltdb/bolt) + * [LevelDB](https://github.com/syndtr/goleveldb) + +Config: + +```yaml +Default: kv + +Driver: + kv: + Badger: grip.db +``` + +---- + +# MongoDB + +GRIP supports storing vertices and edges in [MongoDB][mongo]. + +Config: + +```yaml +Default: mongo + +Drivers: + mongo: + MongoDB: + URL: "mongodb://localhost:27000" + DBName: "gripdb" + Username: "" + Password: "" + UseCorePipeline: False + BatchSize: 0 +``` + +[mongo]: https://www.mongodb.com/ + +`UseCorePipeline` - Default is to use Mongo pipeline API to do graph traversals. +By enabling `UseCorePipeline`, GRIP will do the traversal logic itself, only using +Mongo for graph storage. + +`BatchSize` - For core engine operations, GRIP dispatches element lookups in +batches to minimize query overhead. If missing from config file (which defaults to 0) +the engine will default to 1000. + +---- + + +# GRIDS + +This is an indevelopment high performance graph storage system. + +Config: + +```yaml +Default: db + +Drivers: + db: + Grids: grip-grids.db + +``` + +---- + +# PostgreSQL + +GRIP supports storing vertices and edges in [PostgreSQL][psql]. 
+ +Config: + +```yaml +Default: psql + +Drivers: + psql: + PSQL: + Host: localhost + Port: 15432 + User: "" + Password: "" + DBName: "grip" + SSLMode: disable +``` + +[psql]: https://www.postgresql.org/ + +--- + +# SQLite + +GRIP supports storing vertices and edges in [SQLite] + +Config: + +```yaml +Default: sqlite + +Drivers: + sqlite: + Sqlite: + DBName: tester/sqliteDB +``` + +[psql]: https://sqlite.org/ diff --git a/docs/tools/grip/developer/architecture.d2 b/docs/tools/grip/developer/architecture.d2 new file mode 100644 index 0000000..ecbd726 --- /dev/null +++ b/docs/tools/grip/developer/architecture.d2 @@ -0,0 +1,90 @@ + + +gripql-python: "gripql/python" { + text: |md +# gripql + +Python library +| +} + +gripql-python -> gripql.http + +grip-client : "cmd/" { + graph { + create + drop + stream + list + schema + } + + data { + kvload + load + dump + mongoload + query + delete + } + + config { + mapping + plugin + info + } + + jobs { + job + } +} + +grip-client -> gripql.grpc + +gripql : "gripql/" { + + text: |md +Protobuf defined code +| + grpc + grpc-gateway + + http -> grpc-gateway + grpc-gateway -> grpc : protobuf via network + + http -> grpc-dgw +} + + +gripql.grpc -> server +gripql.grpc-dgw -> server + +server : "server/" { + +} + +server -> pipeline + +pipeline { + gripql-parser + compiler +} + +gdbi { + mongo + mongo-core + pebble +} + +pipeline.compiler -> gdbi + +server -> jobs + +jobs { + store + search + drivers : { + opensearch + flat file + } +} \ No newline at end of file diff --git a/docs/tools/grip/graphql/graph_schemas.md b/docs/tools/grip/graphql/graph_schemas.md new file mode 100644 index 0000000..f68bff4 --- /dev/null +++ b/docs/tools/grip/graphql/graph_schemas.md @@ -0,0 +1,37 @@ +--- +title: Graph Schemas +menu: + main: + parent: graphql + weight: 30 +--- + +# Graph Schemas + +Most GRIP based graphs are not required to have a strict schema. However, GraphQL requires +a graph schema as part of it's API. To utilize the GraphQL endpoint, there must be a +Graph Schema provided to be used by the GRIP engine to determine how to render a GraphQL endpoint. +Graph schemas are themselves an instance of a graph. As such, they can be traversed like any other graph. +The schemas are automatically added to the database following the naming pattern. `{graph-name}__schema__` + +## Get the Schema of a Graph + +The schema of a graph can be accessed via a GET request to `/v1/graph/{graph-name}/schema` + +Alternatively, you can use the grip CLI. `grip schema get {graph-name}` + +## Post a graph schema + +A schema can be attached to an existing graph via a POST request to `/v1/graph/{graph-name}/schema` + +Alternatively, you can use the grip CLI. `grip schema post [graph_name] --jsonSchema {file}` + +Schemas must be loaded as a json file in JSON schema format. see [jsonschema](https://json-schema.org/) spec for more details + +## Raw bulk loading + +Once a schema is attached to a graph, raw json records can be loaded directly to grip without having to be in native grip vertex/edge format. +Schema validation is enforced when using this POST `/v1/rawJson` method. + +A grip CLI alternative is also available with `grip jsonload [ndjson_file_path] [graph_name]` +See https://github.com/bmeg/grip/blob/develop/conformance/tests/ot_bulk_raw.py for a full example using gripql python package. 
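+ +Since schema graphs are themselves regular graphs, they can also be explored from the gripql Python client. A minimal sketch, assuming a local server on the default HTTP port 8201 and an existing graph named `example` (the graph name is only an illustration): + +```python +import gripql + +# Connect to a local GRIP server (adjust the URL for your deployment) +conn = gripql.Connection("http://localhost:8201") + +# Schema graphs follow the "{graph-name}__schema__" naming pattern +schema = conn.graph("example__schema__") + +# Traverse the schema like any other graph and print its elements +for v in schema.V(): +    print(v) +```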
diff --git a/docs/tools/grip/graphql/graphql.md b/docs/tools/grip/graphql/graphql.md new file mode 100644 index 0000000..1de2f34 --- /dev/null +++ b/docs/tools/grip/graphql/graphql.md @@ -0,0 +1,29 @@ +--- +title: GraphQL +menu: + main: + parent: graphql + weight: 25 +--- + +# GraphQL + +Grip graphql tools are defined as go standard library plugins and are located at https://github.com/bmeg/grip-graphql. +A schema based approach was used for defining read plugins. + +## Json Schema + +grip also supports using jsonschema with hypermedia extensions. Given an existing graph called TEST + +``` +./grip schema post TEST --jsonSchema schema.json +``` + +This schema will attach to the TEST graph, and subsequent calls to the bulkAddRaw method with raw Json +as defined by the attached the jsonschema will load directly into grip. + +see conformance/tests/ot_bulk_raw.py for an example + +## Future work + +In the future, autogenerated json schema may be added back to grip to continue to support graphql queries. Currently there is not support for graphql in base Grip without using the plugin repos specified above. diff --git a/docs/tools/grip/gripper/graphmodel.md b/docs/tools/grip/gripper/graphmodel.md new file mode 100644 index 0000000..cd678cf --- /dev/null +++ b/docs/tools/grip/gripper/graphmodel.md @@ -0,0 +1,255 @@ +--- +title: Graph Model + +menu: + main: + parent: gripper + weight: 3 +--- + +# GRIPPER + +GRIP Plugable External Resources + +## Graph Model + +The graph model describes how GRIP will access multiple gripper servers. The mapping +of these data resources is done using a graph. The `vertices` represent how each vertex +type will be mapped, and the `edges` describe how edges will be created. The `_id` +of each vertex represents the prefix domain of all vertices that can be found in that +source. + +The `sources` referenced by the graph are provided to GRIP at run time, each named resource is a +different GRIPPER plugin that abstracts an external resource. +The `vertices` section describes how different collections +found in these sources will be turned into Vertex found in the graph. Finally, the +`edges` section describes the different kinds of rules that can be used build the +edges in the graph. + +Edges can be built from two rules `fieldToField` and `edgeTable`. In `fieldToField`, +a field value found in one vertex can be used to look up matching destination vertices +by using an indexed field found in another collection that has been mapped to a vertex. +For `edgeTable` connections, there is a single collection that represents a connection between +two other collections that have been mapped to vertices. + +## Runtime External Resource Config + +External resources are passed to GRIP as command line options. For the command line: + +``` +grip server config.yaml --er tableServer=localhost:50051 --er pfb=localhost:50052 +``` + +`tableServer` is a ER plugin that serves table data (see `gripper/test-graph`) +while `pfb` parses PFB based files (see https://github.com/bmeg/grip_pfb ) + +The `config.yaml` is + +``` +Default: badger + +Drivers: + badger: + Badger: grip-badger.db + + swapi-driver: + Gripper: + ConfigFile: ./swapi.yaml + Graph: swapi + +``` + +This runs with a default `badger` based driver, but also provides a GRIPPER based +graph from the `swapi` mapping (see example graph map below). 
+ +## Example graph map + +``` +vertices: + - _id: "Character:" + _label: Character + source: tableServer + collection: Character + + - _id: "Planet:" + _label: Planet + collection: Planet + source: tableServer + + - _id: "Film:" + _label: Film + collection: Film + source: tableServer + + - _id: "Species:" + _label: Species + source: tableServer + collection: Species + + - _id: "Starship:" + _label: Starship + source: tableServer + collection: Starship + + - _id: "Vehicle:" + _label: Vehicle + source: tableServer + collection: Vehicle + +edges: + - _id: "homeworld" + _from: "Character:" + _to: "Planet:" + _label: homeworld + fieldToField: + fromField: $.homeworld + toField: $.id + + - _id: species + _from: "Character:" + _to: "Species:" + _label: species + fieldToField: + fromField: $.species + toField: $.id + + - _id: people + _from: "Species:" + _to: "Character:" + _label: people + edgeTable: + source: tableServer + collection: speciesCharacter + fromField: $.from + toField: $.to + + - _id: residents + _from: "Planet:" + _to: "Character:" + _label: residents + edgeTable: + source: tableServer + collection: planetCharacter + fromField: $.from + toField: $.to + + - _id: filmVehicles + _from: "Film:" + _to: "Vehicle:" + _label: "vehicles" + edgeTable: + source: tableServer + collection: filmVehicles + fromField: "$.from" + toField: "$.to" + + - _id: vehicleFilms + _to: "Film:" + _from: "Vehicle:" + _label: "films" + edgeTable: + source: tableServer + collection: filmVehicles + toField: "$.from" + fromField: "$.to" + + - _id: filmStarships + _from: "Film:" + _to: "Starship:" + _label: "starships" + edgeTable: + source: tableServer + collection: filmStarships + fromField: "$.from" + toField: "$.to" + + - _id: starshipFilms + _to: "Film:" + _from: "Starship:" + _label: "films" + edgeTable: + source: tableServer + collection: filmStarships + toField: "$.from" + fromField: "$.to" + + - _id: filmPlanets + _from: "Film:" + _to: "Planet:" + _label: "planets" + edgeTable: + source: tableServer + collection: filmPlanets + fromField: "$.from" + toField: "$.to" + + - _id: planetFilms + _to: "Film:" + _from: "Planet:" + _label: "films" + edgeTable: + source: tableServer + collection: filmPlanets + toField: "$.from" + fromField: "$.to" + + - _id: filmSpecies + _from: "Film:" + _to: "Species:" + _label: "species" + edgeTable: + source: tableServer + collection: filmSpecies + fromField: "$.from" + toField: "$.to" + + - _id: speciesFilms + _to: "Film:" + _from: "Species:" + _label: "films" + edgeTable: + source: tableServer + collection: filmSpecies + toField: "$.from" + fromField: "$.to" + + - _id: filmCharacters + _from: "Film:" + _to: "Character:" + _label: characters + edgeTable: + source: tableServer + collection: filmCharacters + fromField: "$.from" + toField: "$.to" + + - _id: characterFilms + _from: "Character:" + _to: "Film:" + _label: films + edgeTable: + source: tableServer + collection: filmCharacters + toField: "$.from" + fromField: "$.to" + + - _id: characterStarships + _from: "Character:" + _to: "Starship:" + _label: "starships" + edgeTable: + source: tableServer + collection: characterStarships + fromField: "$.from" + toField: "$.to" + + - _id: starshipCharacters + _to: "Character:" + _from: "Starship:" + _label: "pilots" + edgeTable: + source: tableServer + collection: characterStarships + toField: "$.from" + fromField: "$.to" +``` diff --git a/docs/tools/grip/gripper/gripper.md b/docs/tools/grip/gripper/gripper.md new file mode 100644 index 0000000..ab41e83 --- /dev/null +++ 
b/docs/tools/grip/gripper/gripper.md @@ -0,0 +1,22 @@ +--- +title: Intro + +menu: + main: + parent: gripper + weight: 1 +--- + +# GRIPPER +## GRIP Plugin External Resources + +GRIP Plugin External Resources (GRIPPERs) are GRIP drivers that take external +resources and allow GRIP to access them as part of a unified graph. +To integrate new resources into the graph, you +first deploy gripper proxies that plug into the external resources. Each proxy is unique +and configured to access a specific resource. These provide a view into external +resources as a series of document collections. For example, an SQL gripper would +plug into an SQL server and provide its tables as a set of collections, with each +row a document. A gripper is written as a gRPC server. + +![GRIPPER Architecture](/img/gripper_architecture.png) diff --git a/docs/tools/grip/gripper/proxy.md b/docs/tools/grip/gripper/proxy.md new file mode 100644 index 0000000..e232715 --- /dev/null +++ b/docs/tools/grip/gripper/proxy.md @@ -0,0 +1,50 @@ +--- +title: External Resource Proxies + +menu: + main: + parent: gripper + weight: 2 +--- + +# GRIPPER + +## GRIPPER proxy + +With the external resources normalized to a single data model, the graph model +describes how to connect the set of collections into a graph. Each GRIPPER +is required to provide a GRPC interface that allows access to collections stored +in the resource. + +The required functions include: + +``` +rpc GetCollections(Empty) returns (stream Collection); +``` +`GetCollections` returns a list of all of the Collections accessible via this server. + +``` +rpc GetCollectionInfo(Collection) returns (CollectionInfo); +``` +`GetCollectionInfo` provides information, such as the list of indexed fields, in a collection. + +``` +rpc GetIDs(Collection) returns (stream RowID); +``` +`GetIDs` returns a stream of all of the IDs found in a collection. + +``` +rpc GetRows(Collection) returns (stream Row); +``` +`GetRows` returns a stream of all of the rows in a collection. + +``` +rpc GetRowsByID(stream RowRequest) returns (stream Row); +``` +`GetRowsByID` accepts a stream of row requests, each one requesting a single row +by its id, and then returns a stream of results. + +``` +rpc GetRowsByField(FieldRequest) returns (stream Row); +``` +`GetRowsByField` searches a collection, looking for values found in an indexed field. diff --git a/docs/tools/grip/index.md b/docs/tools/grip/index.md new file mode 100644 index 0000000..b89a232 --- /dev/null +++ b/docs/tools/grip/index.md @@ -0,0 +1,42 @@ +--- +title: GRIP +--- + +GRIP (Graph Resource Integration Platform) is a graph database and query engine for integrating heterogeneous data sources into a unified, queryable graph. Key features include: + +- **Distributed Computing**: Scalable processing across multiple nodes. +- **Database Integration**: Built-in support for MongoDB, PostgreSQL, SQLite, and embedded key-value stores. +- **API Endpoints**: RESTful APIs for managing data workflows and monitoring. +- **Flexible Query Language**: GRIPQL for complex data queries and transformations. +- **Job Management**: Schedule, monitor, and manage data processing jobs in real-time.
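+ +Alongside the command line examples below, graphs can also be queried from Python using the gripql client (covered in more detail in the tutorials). A minimal sketch, assuming a local server and an existing graph named `example` (the graph name is only an illustration): + +```python +import gripql + +# Connect to the HTTP interface of a local GRIP server (port 8201 by default) +conn = gripql.Connection("http://localhost:8201") +g = conn.graph("example") + +# Count the vertices, then run a simple GRIPQL traversal +print(g.V().count().execute()) +for v in g.V().hasLabel("users").limit(5): +    print(v) +```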
+ + + +``` +# Start server +$ grip server --config grip.yml + +# List all graphs +$ grip list + +# Create a graph +$ grip create example + +# Drop a graph +$ grip drop example + +# Load data into a graph +$ grip load example --edge edges.txt --vertex vertices.txt + +# Query a graph +$ grip query example 'V().hasLabel("users")' + +#Get vertex/edge counts for a graph +$ grip info example + +# Get the schema for a graph +$ grip schema get example + +# Dump vertices/edges from a graph +$ grip dump example --vertex +``` \ No newline at end of file diff --git a/docs/tools/grip/jobs_api.md b/docs/tools/grip/jobs_api.md new file mode 100644 index 0000000..7589015 --- /dev/null +++ b/docs/tools/grip/jobs_api.md @@ -0,0 +1,63 @@ +--- +title: Jobs API +menu: + main: + identifier: Jobs + weight: 40 +--- + +# Jobs API + +Not all queries return instantaneously, additionally some queries elements are used +repeatedly. The query Jobs API provides a mechanism to submit graph traversals +that will be evaluated asynchronously and can be retrieved at a later time. + + +### Submitting a job + +``` +job = G.V().hasLabel("Planet").out().submit() +``` + +### Getting job status +``` +jinfo = G.getJob(job["id"]) +``` + +Example job info: +```json +{ + "id": "job-326392951", + "graph": "test_graph_qd7rs7", + "state": "COMPLETE", + "count": "12", + "query": [{"v": []}, {"hasLabel": ["Planet"]}, {"as": "a"}, {"out": []}], + "timestamp": "2021-03-30T23:12:01-07:00" +} +``` + +### Reading job results +``` +for row in G.readJob(job["id"]): + print(row) +``` + +### Search for jobs + +Find jobs that match the prefix of the current request (example should find job from G.V().hasLabel("Planet").out()) + +``` +jobs = G.V().hasLabel("Planet").out().out().count().searchJobs() +``` + +If there are multiple jobs that match the prefix of the search, all of them will be returned. It will be a client side +job to decide which of the jobs to use as a starting point. This can either be the job with the longest matching prefix, or +the most recent job. Note, that if the underlying database has changed since the job was run, adding additional steps to the +traversal may produce inaccurate results. + +Once `job` has been selected from the returned list you can use these existing results and continue the traversal. + +``` +for res in G.resume(job["id"]).out().count(): + print(res) +``` diff --git a/docs/tools/grip/queries/aggregation.md b/docs/tools/grip/queries/aggregation.md new file mode 100644 index 0000000..0012a81 --- /dev/null +++ b/docs/tools/grip/queries/aggregation.md @@ -0,0 +1,84 @@ +--- +title: Aggregation +menu: + main: + parent: Queries + weight: 6 +--- + +# Aggregation + +These methods provide a powerful way to analyze and summarize data in your GripQL graph database. They allow you to perform various types of aggregations, including term frequency, histograms, percentiles, and more. By combining these with other traversal functions like `has`, `hasLabel`, etc., you can create complex queries that extract specific insights from your data. + +## `.aggregate([aggregations])` +Groups and summarizes data from the graph. It allows you to perform calculations on vertex or edge properties. The following aggregation types are available: + +## Aggregation Types +### `.gripql.term(name, field, size)` +Return top n terms and their counts for a field. 
+```python +G.V().hasLabel("Person").aggregate(gripql.term("top-names", "name", 10)) +``` +Counts `name` occurrences across `Person` vertices and returns the 10 most frequent `name` values. + +### `.gripql.histogram(name, field, interval)` +Return binned counts for a field. +```python +G.V().hasLabel("Person").aggregate(gripql.histogram("age-hist", "age", 5)) +``` +Creates a histogram of `age` values with bins of width 5 across `Person` vertices. + +### `.gripql.percentile(name, field, percents=[])` +Return percentiles for a field. +```python +G.V().hasLabel("Person").aggregate(gripql.percentile("age-percentiles", "age", [25, 50, 75])) +``` +Calculates the 25th, 50th, and 75th percentiles for `age` values across `Person` vertices. + +### `.gripql.field("fields", "$")` +Returns all of the fields found in the data structure. Use `$` to get a listing of all fields found at the root level of the `data` property of vertices or edges. + +--- + +## `.count()` +Returns the total number of elements in the traversal. +```python +G.V().hasLabel("Person").count() +``` +This query returns the total number of vertices with the label "Person". + +--- + +## `.distinct([fields])` +Filters the traversal to return only unique elements. If `fields` are provided, uniqueness is determined by the combination of values in those fields; otherwise, the `_id` is used. +```python +G.V().hasLabel("Person").distinct(["name", "age"]) +``` +This query returns only unique "Person" vertices, where uniqueness is determined by the combination of "name" and "age" values. + +--- + +## `.sort([fields])` +Sort the output using the field values. You can sort in ascending or descending order by providing `descending=True` as an argument to `sort()` method. +```python +G.V().hasLabel("Person").sort("age") +``` +This query sorts "Person" vertices based on their age in ascending order. + +## `.limit(n)` +Limits the number of results returned by your query. +```python +G.V().hasLabel("Person").limit(10) +``` +This query limits the results to the first 10 "Person" vertices found. + +--- + +## `.skip(n)` +Offsets the results returned by your query. +```python +G.V().hasLabel("Person").skip(5) +``` +This query skips the first 5 "Person" vertices and returns the rest. + + diff --git a/docs/tools/grip/queries/filtering.md b/docs/tools/grip/queries/filtering.md new file mode 100644 index 0000000..2baac70 --- /dev/null +++ b/docs/tools/grip/queries/filtering.md @@ -0,0 +1,151 @@ +--- +title: Filtering +menu: + main: + parent: Queries + weight: 4 +--- + +# Filtering in GripQL + +GripQL provides powerful filtering capabilities using the .has() method and various condition functions. +Here's a comprehensive guide:.has()The .has() method is used to filter elements (vertices or edges) based on specified conditions. + +Conditions are functions provided by the gripql module that define the filtering criteria. + +## Comparison Operators + +### `gripql.eq(variable, value)` +Equal to (==) + +``` +G.V().has(gripql.eq("symbol", "TP53")) +# Returns vertices where the 'symbol' property is equal to 'TP53'. +``` + +### `gripql.neq(variable, value)` +Not equal to (!=) + +``` +G.V().has(gripql.neq("symbol", "TP53")) +# Returns vertices where the 'symbol' property is not equal to 'TP53'. +``` + +### `gripql.gt(variable, value)` +Greater than (>) + +``` +G.V().has(gripql.gt("age", 45)) +# Returns vertices where the 'age' property is greater than 45. 
+``` + +### `gripql.lt(variable, value)` +Less than (<) +``` +G.V().has(gripql.lt("age", 45)) +# Returns vertices where the 'age' property is less than 45. +``` + +### `gripql.gte(variable, value)` +Greater than or equal to (>=) +``` +G.V().has(gripql.gte("age", 45)) +# Returns vertices where the 'age' property is greater than or equal to 45. +``` + +### `gripql.lte(variable, value)` +Less than or equal to (<=) + +``` +G.V().has(gripql.lte("age", 45)) +# Returns vertices where the 'age' property is less than or equal to 45. +``` + +--- + +## Range Operators + +### `gripql.inside(variable, [lower_bound, upper_bound])` +lower_bound < variable < upper_bound (exclusive) + +``` +G.V().has(gripql.inside("age", [30, 45])) +# Returns vertices where the 'age' property is greater than 30 and less than 45. +``` + +### `gripql.outside(variable, [lower_bound, upper_bound])` +variable < lower_bound OR variable > upper_bound + +``` +G.V().has(gripql.outside("age", [30, 45])) +# Returns vertices where the 'age' property is less than 30 or greater than 45. +``` + +### `gripql.between(variable, [lower_bound, upper_bound])` +lower_bound <= variable < upper_bound + +``` +G.V().has(gripql.between("age", [30, 45])) +# Returns vertices where the 'age' property is greater than or equal to 30 and less than 45. +``` + +--- + +## Set Membership Operators + +### `gripql.within(variable, values)` +variable is in values + +``` +G.V().has(gripql.within("symbol", ["TP53", "BRCA1"])) +# Returns vertices where the 'symbol' property is either 'TP53' or 'BRCA1'. +``` + +### `gripql.without(variable, values)` +variable is not in values + +``` +G.V().has(gripql.without("symbol", ["TP53", "BRCA1"])) +# Returns vertices where the 'symbol' property is neither 'TP53' nor 'BRCA1'. +``` + +--- + +## String/Array Containment + +### `gripql.contains(variable, value)` +The variable (which is typically a list/array) contains value. + +``` +G.V().has(gripql.contains("groups", "group1")) +# Returns vertices where the 'groups' property (which is a list) contains the value "group1". +# Example: {"groups": ["group1", "group2", "group3"]} would match. +``` + +--- + +## Logical Operators + +### `gripql.and_([condition1, condition2, ...])` +Logical AND; all conditions must be true. + +``` +G.V().has(gripql.and_([gripql.lte("age", 45), gripql.gte("age", 35)])) +# Returns vertices where the 'age' property is less than or equal to 45 AND greater than or equal to 35. +``` + +### `gripql.or_([condition1, condition2, ...])` +Logical OR; at least one condition must be true. + +``` +G.V().has(gripql.or_([gripql.eq("symbol", "TP53"), gripql.eq("symbol", "BRCA1")])) +# Returns vertices where the 'symbol' property is either 'TP53' OR 'BRCA1'. +``` + +### `gripql.not_(condition)` +Logical NOT; negates the condition + +``` +G.V().has(gripql.not_(gripql.eq("symbol", "TP53"))) +# Returns vertices where the 'symbol' property is NOT equal to 'TP53'. +``` diff --git a/docs/tools/grip/queries/iterations.md b/docs/tools/grip/queries/iterations.md new file mode 100644 index 0000000..5af26c9 --- /dev/null +++ b/docs/tools/grip/queries/iterations.md @@ -0,0 +1,61 @@ +--- +title: Iteration +menu: + main: + parent: Queries + weight: 16 +--- + +# Iteration Commands + +A common operation in graph search is the ability to iteratively repeat a search pattern. For example, a 'friend of a friend' search may become a 'friend of a friend of a friend' search. 
In the GripQL language, cycles, iterations and conditional operations are encoded using a 'mark' and 'jump' based interface. This is similar to using a 'goto' statement in traditional programming languages. While more primitive than the repeat mechanisms seen in Gremlin, this pattern allows for much simpler query compilation and implementation. + +However, due to security concerns regarding potential denial of service attacks that could be created with the use of 'mark' and 'jump', these operations are restricted for most accounts. The server enforces this by rejecting, without executing them, any queries from unauthorized users that use these commands. In a future upgrade, a proposed security feature would also allow the server to track the total number of iterations a traveler has made in a cycle and enforce a hard cutoff; for example, a user could submit a query with a maximum of 5 iterations. + +## Operation Commands +### `.mark(name)` +Mark a named segment in the stream processor that can receive jumps. This command is used to label sections of the query operation list that can accept travelers from the `jump` command. + +**Parameters:** +- `name` (str): The name given to the marked segment. + +### `.jump(dest, condition, emit)` +If the condition is true, send the traveler to the named mark. If `emit` is True, also send a copy down the processing chain. If `condition` is None, always do the jump. This command is used to move travelers from one marked segment to another based on a specified condition. + +**Parameters:** +- `dest` (str): The name of the destination mark segment. Travelers are moved to this point when they satisfy the `condition` parameter. +- `condition` (_expr_ or None): An expression that determines if the traveler should jump. If it evaluates to True, the traveler jumps to the specified destination. If None, the traveler always jumps to the specified destination. +- `emit` (bool): Determines whether a copy of the traveler is emitted down the processing chain after jumping. If False, only the original traveler is processed. + +### `.set(field, value)` +Set values within the traveler's memory. These values can be used to store cycle counts. This command sets a field in the traveler's memory to a specified value. + +**Parameters:** +- `field` (str): The name of the field to set. +- `value` (_expr_): The value to set for the specified field. This can be any valid GripQL expression that resolves to a scalar value. + +### `.increment(field, value)` +Increment a field by a specified value. This command increments a field in the traveler's memory by a specified amount. + +**Parameters:** +- `field` (str): The name of the field to increment. +- `value` (_expr_): The amount to increment the specified field by. This can be any valid GripQL expression that resolves to an integer value. + +## Example Queries +The following examples demonstrate how to use these commands in a query: + +```python +q = G.V("Character:1").set("count", 0).as_("start").mark("a").out().increment("$start.count") +q = q.has(gripql.lt("$start.count", 2)) +q = q.jump("a", None, True) +``` +This query starts from a vertex with the ID "Character:1". It sets a field named "count" to 0 and annotates this vertex as "start". Then it marks this position in the operation list for future reference. The `out` command moves travelers along the outgoing edges of their current positions, incrementing the "count" field each time.
If the count is less than 2, the traveler jumps back to the marked location, effectively creating a loop. + +```python +q = G.V("Character:1").set("count", 0).as_("start").mark("a").out().increment("$start.count") +q = q.has(gripql.lt("$start.count", 2)) +q = q.jump("a", None, False) +``` +This query is similar to the previous one, but in this case, the traveler only jumps back without emitting a copy down the processing chain. The result is that only one vertex will be included in the output, even though there are multiple iterations due to the jump command. + +In both examples, the use of `mark` and `jump` commands create an iterative pattern within the query operation list, effectively creating a 'friend of a friend' search that can repeat as many times as desired. These patterns are crucial for complex graph traversals in GripQL. diff --git a/docs/tools/grip/queries/jobs.md b/docs/tools/grip/queries/jobs.md new file mode 100644 index 0000000..49e132f --- /dev/null +++ b/docs/tools/grip/queries/jobs.md @@ -0,0 +1,26 @@ + + + +## .submit() +Post the traversal as an asynchronous job and get a job ID. + +Example: Submit a query to be processed in the background + +```python +job_id = G.V('vertexID').hasLabel('Vertex').submit() +print(job_id) # print job ID +``` +--- + +## .searchJobs() +Find jobs that match this query and get their status and results if available. + +Example: Search for jobs with the specified query and print their statuses and results + +```python +for result in G.V('vertexID').hasLabel('Vertex').searchJobs(): + print(result['status']) # print job status + if 'results' in result: + print(result['results']) # print job results +``` +--- diff --git a/docs/tools/grip/queries/jsonpath.md b/docs/tools/grip/queries/jsonpath.md new file mode 100644 index 0000000..2f26511 --- /dev/null +++ b/docs/tools/grip/queries/jsonpath.md @@ -0,0 +1,84 @@ +--- +title: Referencing Fields +menu: + main: + parent: Queries + weight: 2 +--- + +# Referencing Vertex/Edge Properties + +Several operations (where, fields, render, etc.) reference properties of the vertices/edges during the traversal. +GRIP uses a variation on JSONPath syntax as described in http://goessner.net/articles/ to reference fields during traversals. + +The following query: + +``` +O.V(["ENSG00000012048"]).as_("gene").out("variant") +``` + +Starts at vertex `ENSG00000012048` and marks as `gene`: + +```json +{ + "_id": "ENSG00000012048", + "_label": "gene", + "symbol": { + "ensembl": "ENSG00000012048", + "hgnc": 1100, + "entrez": 672, + "hugo": "BRCA1" + }, + "transcipts": ["ENST00000471181.7", "ENST00000357654.8", "ENST00000493795.5"] +} +``` + +as "gene" and traverses the graph to: + +```json +{ + "_id": "NM_007294.3:c.4963_4981delTGGCCTGACCCCAGAAG", + "_label": "variant", + "type": "deletion", + "publications": [ + { + "pmid": 29480828, + "doi": "10.1097/MD.0000000000009380" + }, + { + "pmid": 23666017, + "doi": "10.1097/IGC.0b013e31829527bd" + } + ] +} +``` + +Below is a table of field and the values they would reference in subsequent traversal operations. 
+ +| jsonpath | result | +| :------------------------- | :------------------- | +| _id | "NM_007294.3:c.4963_4981delTGGCCTGACCCCAGAAG" | +| _label | "variant" | +| type | "deletion" | +| publications[0].pmid | 29480828 | +| publications[:].pmid | [29480828, 23666017] | +| publications.pmid | [29480828, 23666017] | +| $gene.symbol.hugo | "BRCA1" | +| $gene.transcripts[0] | "ENST00000471181.7" | + + +## Usage Example: + +``` +O.V(["ENSG00000012048"]).as_("gene").out("variant").render({"variant_id": "_id", "variant_type": "type", "gene_id": "$gene._id"}) +``` + +returns + +``` +{ + "variant_id": "NM_007294.3:c.4963_4981delTGGCCTGACCCCAGAAG", + "variant_type": "deletion", + "gene_id": "ENSG00000012048" +} +``` diff --git a/docs/tools/grip/queries/output.md b/docs/tools/grip/queries/output.md new file mode 100644 index 0000000..964e776 --- /dev/null +++ b/docs/tools/grip/queries/output.md @@ -0,0 +1,74 @@ +--- +title: Output Control +menu: + main: + parent: Queries + weight: 10 +--- + +--- + +# Output control + +## `.limit(count)` +Limit number of total output rows +```python +G.V().limit(5) +``` +--- +## `.skip(count)` +Start return after offset + +Example: +```python +G.V().skip(10).limit(5) + +``` +This query skips the first 10 vertices and then returns the next 5. +--- +## `.range(start, stop)` +Selects a subset of the results based on their index. `start` is inclusive, and `stop` is exclusive. +Example: +```python +G.V().range(5, 10) +``` +--- +## `.fields([fields])` +Specifies which fields of a vertex or edge to include or exclude in the output. By default, `_id`, `_label`, `_from`, and `_to` are included. + +If `fields` is empty, all properties are excluded. +If `fields` contains field names, only those properties are included. +If `fields` contains field names prefixed with `-`, those properties are excluded, and all others are included. + +Examples: + +Include only the 'symbol' property: +```python +G.V("vertex1").fields(["symbol"]) +``` + +Exclude the 'symbol' property: +```python +G.V("vertex1").fields(["-symbol"]) +``` +Exclude all properties: +```python +G.V("vertex1").fields([]) +``` + +--- + +## `.render(template)` + +Transforms the current selection into an arbitrary data structure defined by the `template`. The `template` is a string that can include placeholders for vertex/edge properties. + +Example: +```python +G.V("vertex1").render( {"node_info" : {"id": "$._id", "label": "$._label"}, "data" : {"whatToExpect": "$.climate"}} ) +``` + +Assuming `vertex1` has `_id`, `_label`, and `symbol` properties, this would return a JSON object with those fields. + +```json +{"node_info" : {"id" :"Planet:2", "label":"Planet"}, "data":{"whatToExpect":"arid"} } +``` diff --git a/docs/tools/grip/queries/record_transforms.md b/docs/tools/grip/queries/record_transforms.md new file mode 100644 index 0000000..bf3d589 --- /dev/null +++ b/docs/tools/grip/queries/record_transforms.md @@ -0,0 +1,131 @@ +--- +title: Record Transforms +menu: + main: + parent: Queries + weight: 5 +--- + + +# Record Manipulation + +## `.unwind(fields)` +Expands an array-valued field into multiple rows, one for each element in the array. 
+Example: + +Graph +```json +{"vertex" : {"_id":"1", "_label":"Thing", "stuff" : ["1", "2", "3"]}} +``` + +Query +```python +G.V("1").unwind("stuff") +``` + +Result +```json +{"_id":"1", "_label":"Thing", "stuff" : "1"} +{"_id":"1", "_label":"Thing", "stuff" : "2"} +{"_id":"1", "_label":"Thing", "stuff" : "3"} +``` + +## `.group({"dest":"field"})` +Collect all travelers that are on the same element while aggregating specific fields. + +For the example: +```python +G.V().hasLabel("Planet").as_("planet").out("residents").as_("character").select("planet").group( {"people" : "$character.name"} ) +``` + +All of the travelers that start on the same planet go out to the residents, collect them using `as_`, and then return to the origin planet using the `select` statement. The group statement aggregates the `name` fields from the character nodes that were visited and collects them into a list named `people` that is added to the current planet node. + +Output: +```json +{ + "vertex": { + "_id": "Planet:2", + "_label": "Planet", + "climate": "temperate", + "diameter": 12500, + "gravity": null, + "name": "Alderaan", + "orbital_period": 364, + "people": [ + "Leia Organa", + "Raymus Antilles" + ], + "population": 2000000000, + "rotation_period": 24, + "surface_water": 40, + "system": { + "created": "2014-12-10T11:35:48.479000Z", + "edited": "2014-12-20T20:58:18.420000Z" + }, + "terrain": [ + "grasslands", + "mountains" + ], + "url": "https://swapi.co/api/planets/2/" + } +} +{ + "vertex": { + "_id": "Planet:1", + "_label": "Planet", + "climate": "arid", + "diameter": 10465, + "gravity": null, + "name": "Tatooine", + "orbital_period": 304, + "people": [ + "Luke Skywalker", + "C-3PO", + "Darth Vader", + "Owen Lars", + "Beru Whitesun lars", + "R5-D4", + "Biggs Darklighter" + ], + "population": 200000, + "rotation_period": 23, + "surface_water": 1, + "system": { + "created": "2014-12-09T13:50:49.641000Z", + "edited": "2014-12-21T20:48:04.175778Z" + }, + "terrain": [ + "desert" + ], + "url": "https://swapi.co/api/planets/1/" + } +} +``` + +## `.pivot(id, key, value)` + +Aggregate fields across multiple records into a single record using a pivot operation. A pivot is +an operation where a two-column matrix, with one column for keys and another column for values, is +transformed so that the keys are used to name the columns and the values are put in those columns.
+ +So the stream of vertices: + +``` +{"_id":"observation_a1", "_label":"Observation", "subject":"Alice", "key":"age", "value":36} +{"_id":"observation_a2", "_label":"Observation", "subject":"Alice", "key":"sex", "value":"Female"} +{"_id":"observation_a3", "_label":"Observation", "subject":"Alice", "key":"blood_pressure", "value":"111/78"} +{"_id":"observation_b1", "_label":"Observation", "subject":"Bob", "key":"age", "value":42} +{"_id":"observation_b2", "_label":"Observation", "subject":"Bob", "key":"sex", "value":"Male"} +{"_id":"observation_b3", "_label":"Observation", "subject":"Bob", "key":"blood_pressure", "value":"120/80"} +``` + +with `.pivot("subject", "key", "value")` will produce: + +``` +{"_id":"Alice", "age":36, "sex":"Female", "blood_pressure":"111/78"} +{"_id":"Bob", "age":42, "sex":"Male", "blood_pressure":"120/80"} +``` diff --git a/docs/tools/grip/queries/traversal_start.md b/docs/tools/grip/queries/traversal_start.md new file mode 100644 index 0000000..6a4fd1d --- /dev/null +++ b/docs/tools/grip/queries/traversal_start.md @@ -0,0 +1,30 @@ + +--- +title: Start a Traversal +menu: + main: + parent: Queries + weight: 1 +--- + +# Start a Traversal + +All traversal based queries must start with a `V()` command, starting the travalers on the vertices of the graph. + +## `.V([ids])` +Start query from Vertex + +```python +G.V() +``` + +Returns all vertices in graph + +```python +G.V(["vertex1"]) +``` + +Returns: +```json +{"_id" : "vertex1", "_label":"TestVertex"} +``` diff --git a/docs/tools/grip/queries/traverse_graph.md b/docs/tools/grip/queries/traverse_graph.md new file mode 100644 index 0000000..568f27d --- /dev/null +++ b/docs/tools/grip/queries/traverse_graph.md @@ -0,0 +1,76 @@ +--- +title: Traverse the Graph +menu: + main: + parent: Queries + weight: 3 +--- + +# Traverse the graph +To move travelers between different elements of the graph, the traversal commands `in_` and `out` move along the edges, respecting the directionality. The `out` commands follow `_from` to `_to`, while the `in_` command follows `_to` to `_from`. + +## `.in_(), inV()` +Following incoming edges. Optional argument is the edge label (or list of labels) that should be followed. If no argument is provided, all incoming edges. + +```python +G.V().in_(label=['edgeLabel1', 'edgeLabel2']) +``` +--- + +## `.out(), .outV()` +Following outgoing edges. Optional argument is the edge label (or list of labels) that should be followed. If no argument is provided, all outgoing edges. + +```python +G.V().out(label='edgeLabel') +``` +--- + +## `.both(), .bothV()` +Following all edges (both in and out). Optional argument is the edge label (or list of labels) that should be followed. If no argument is provided, all edges. + +```python +G.V().outE().both(label='edgeLabel') +``` +--- + +## `.inE()` +Following incoming edges, but return the edge as the next element. This can be used to inspect edge properties. Optional argument is the edge label (or list of labels) that should be followed. To return back to a vertex, use `.in_` or `.out` + +```python +G.V().inE(label='edgeLabel') +``` +--- + +## `.outE()` +Following outgoing edges, but return the edge as the next element. This can be used to inspect edge properties. Optional argument is the edge label (or list of labels) that should be followed. To return back to a vertex, use `.in_` or `.out` + +```python +G.V().outE(label='edgeLabel') +``` +--- + +## `.bothE()` +Following all edges, but return the edge as the next element. 
This can be used to inspect edge properties. Optional argument is the edge label (or list of labels) that should be followed. To return back to a vertex, use `.in_` or `.out` + +```python +G.V().bothE(label='edgeLabel') +``` +--- + +# AS and SELECT + +The `as_` and `select` commands allow a traveler to mark a step in the traversal and return to it as a later step. + +## `.as_(name)` +Store current row for future reference + +```python +G.V().as_("a").out().as_("b") +``` + +## `.select(name)` +Move traveler to previously marked position + +```python +G.V().mark("a").out().mark("b").select("a") +``` diff --git a/docs/tools/grip/security/basic.md b/docs/tools/grip/security/basic.md new file mode 100644 index 0000000..4bf232e --- /dev/null +++ b/docs/tools/grip/security/basic.md @@ -0,0 +1,60 @@ +--- +title: Basic Auth + +menu: + main: + parent: Security + weight: 1 +--- + +# Basic Auth + +By default, an GRIP server allows open access to its API endpoints, but it +can be configured to require basic password authentication. To enable this, +include users and passwords in your config file: + +```yaml +Server: + BasicAuth: + - User: testuser + Password: abc123 +``` + +Make sure to properly protect the configuration file so that it's not readable +by everyone: + +```bash +$ chmod 600 grip.config.yml +``` + +To use the password, set the `GRIP_USER` and `GRIP_PASSWORD` environment variables: +```bash +$ export GRIP_USER=testuser +$ export GRIP_PASSWORD=abc123 +$ grip list +``` + +## Using the Python Client + +Some GRIP servers may require authorizaiton to access its API endpoints. The client can be configured to pass +authorization headers in its requests: + +```python +import gripql + +# Basic Auth Header - {'Authorization': 'Basic dGVzdDpwYXNzd29yZA=='} +G = gripql.Connection("https://bmeg.io", user="test", password="password").graph("bmeg") +``` + +Although GRIP only supports basic password authentication, some servers may be proctected via a nginx or apache +server. 
The python client can be configured to handle these cases as well: + +```python +import gripql + +# Bearer Token - {'Authorization': 'Bearer iamnotarealtoken'} +G = gripql.Connection("https://bmeg.io", token="iamnotarealtoken").graph("bmeg") + +# OAuth2 / Custom - {"OauthEmail": "fake.user@gmail.com", "OauthAccessToken": "iamnotarealtoken", "OauthExpires": 1551985931} +G = gripql.Connection("https://bmeg.io", credential_file="~/.grip_token.json").graph("bmeg") +``` diff --git a/docs/tools/grip/tutorials/amazon.md b/docs/tools/grip/tutorials/amazon.md new file mode 100644 index 0000000..f215d7c --- /dev/null +++ b/docs/tools/grip/tutorials/amazon.md @@ -0,0 +1,75 @@ +--- +title: Amazon Purchase Network + +menu: + main: + parent: Tutorials + weight: 1 +--- + +# Explore Amazon Product Co-Purchasing Network Metadata + +Download the data + +``` +curl -O http://snap.stanford.edu/data/bigdata/amazon/amazon-meta.txt.gz +``` + +Convert the data into vertices and edges + +``` +python $GOPATH/src/github.com/bmeg/grip/example/amazon_convert.py amazon-meta.txt.gz amazon.data +``` + +Turn on grip and create a graph called 'amazon' + +``` +grip server & ; sleep 1 ; grip create amazon +``` + +Load the vertices/edges into the graph + +``` +grip load amazon --edge amazon.data.edge --vertex amazon.data.vertex +``` + +Query the graph + +_command line client_ + +``` +grip query amazon 'V().hasLabel("Video").out()' +``` + +The full command syntax and command list can be found at grip/gripql/javascript/gripql.js + +_python client_ + +Initialize a virtual environment and install gripql python package + +``` +python -m venv venv ; source venv/bin/activate +pip install -e gripql/python +``` + +Example code + +```python +import gripql + +conn = gripql.Connection("http://localhost:8201") + +g = conn.graph("amazon") + +# Count the Vertices +print("Total vertices: ", g.V().count().execute()) +# Count the Edges +print("Total edges: ", g.V().outE().count().execute()) + +# Try simple travesral +print("Edges connected to 'B00000I06U' vertex: %s" %g.V("B00000I06U").outE().execute()) + +# Find every Book that is similar to a DVD +for result in g.V().has(gripql.eq("group", "Book")).as_("a").out("similar").has(gripql.eq("group", "DVD")).as_("b").select("a"): + print(result) +``` diff --git a/docs/tools/grip/tutorials/pathway-commons.md b/docs/tools/grip/tutorials/pathway-commons.md new file mode 100644 index 0000000..d0d2308 --- /dev/null +++ b/docs/tools/grip/tutorials/pathway-commons.md @@ -0,0 +1,11 @@ + + +Get Pathway Commons release +``` +curl -O http://www.pathwaycommons.org/archives/PC2/v10/PathwayCommons10.All.BIOPAX.owl.gz +``` + +Convert to Property Graph +``` +grip rdf --dump --gzip pc PathwayCommons10.All.BIOPAX.owl.gz -m "http://pathwaycommons.org/pc2/#=pc:" -m "http://www.biopax.org/release/biopax-level3.owl#=biopax:" +``` diff --git a/docs/tools/grip/tutorials/tcga-rna.md b/docs/tools/grip/tutorials/tcga-rna.md new file mode 100644 index 0000000..3098295 --- /dev/null +++ b/docs/tools/grip/tutorials/tcga-rna.md @@ -0,0 +1,133 @@ +--- +title: TCGA RNA Expression + +menu: + main: + parent: Tutorials + weight: 2 +--- + +### Explore TCGA RNA Expression Data + +Create the graph + +``` +grip create tcga-rna +``` + +Get the data + +``` +curl -O http://download.cbioportal.org/gbm_tcga_pub2013.tar.gz +tar xvzf gbm_tcga_pub2013.tar.gz +``` + +Load clinical data + +``` +./example/load_matrix.py tcga-rna gbm_tcga_pub2013/data_clinical.txt --row-label 'Donor' +``` + +Load RNASeq data + +``` +./example/load_matrix.py tcga-rna 
gbm_tcga_pub2013/data_RNA_Seq_v2_expression_median.txt -t --index-col 1 --row-label RNASeq --row-prefix "RNA:" --exclude RNA:Hugo_Symbol +``` + +Connect RNASeq data to Clinical data + +``` +./example/load_matrix.py tcga-rna gbm_tcga_pub2013/data_RNA_Seq_v2_expression_median.txt -t --index-col 1 --no-vertex --edge 'RNA:{_id}' rna +``` + +Connect Clinical data to subtypes + +``` +./example/load_matrix.py tcga-rna gbm_tcga_pub2013/data_clinical.txt --no-vertex -e "{EXPRESSION_SUBTYPE}" subtype --dst-vertex "{EXPRESSION_SUBTYPE}" Subtype +``` + +Load Hugo Symbol to EntrezID translation table from RNA matrix annotations + +``` +./example/load_matrix.py tcga-rna gbm_tcga_pub2013/data_RNA_Seq_v2_expression_median.txt --column-include Entrez_Gene_Id --row-label Gene +``` + +Load Mutation Information + +``` +./example/load_matrix.py tcga-rna gbm_tcga_pub2013/data_mutations_extended.txt --skiprows 1 --index-col -1 --regex Matched_Norm_Sample_Barcode '\-\d\d$' '' --edge '{Matched_Norm_Sample_Barcode}' variantIn --edge '{Hugo_Symbol}' effectsGene --column-exclude ma_func.impact ma_fi.score MA_FI.score MA_Func.Impact MA:link.MSA MA:FImpact MA:protein.change MA:link.var MA:FIS MA:link.PDB --row-label Variant +``` + +Load Proneural samples into a matrix + +```python +import pandas +import gripql + +conn = gripql.Connection("http://localhost:8201") +g = conn.graph("tcga-rna") +genes = {} +for k, v in g.V().hasLabel("Gene").render(["_id", "Hugo_Symbol"]): + genes[k] = v +data = {} +for row in g.V("Proneural").in_().out("rna").render(["_id", "_data"]): + data[row[0]] = row[1] +samples = pandas.DataFrame(data).rename(genes).transpose().fillna(0.0) +``` + +# Matrix Load project + +``` +usage: load_matrix.py [-h] [--sep SEP] [--server SERVER] + [--row-label ROW_LABEL] [--row-prefix ROW_PREFIX] [-t] + [--index-col INDEX_COL] [--connect] + [--col-label COL_LABEL] [--col-prefix COL_PREFIX] + [--edge-label EDGE_LABEL] [--edge-prop EDGE_PROP] + [--columns [COLUMNS [COLUMNS ...]]] + [--column-include COLUMN_INCLUDE] [--no-vertex] + [-e EDGE EDGE] [--dst-vertex DST_VERTEX DST_VERTEX] + [-x EXCLUDE] [-d] + db input + +positional arguments: + db Destination Graph + input Input File + +optional arguments: + -h, --help show this help message and exit + --sep SEP TSV delimiter + --server SERVER Server Address + --row-label ROW_LABEL + Vertex Label used when loading rows + --row-prefix ROW_PREFIX + Prefix added to row vertex id + -t, --transpose Transpose matrix + --index-col INDEX_COL + Column number to use as index (and id for vertex + load) + --connect Switch to 'fully connected mode' and load matrix cell + values on edges between row and column names + --col-label COL_LABEL + Column vertex label in 'connect' mode + --col-prefix COL_PREFIX + Prefix added to col vertex id in 'connect' mode + --edge-label EDGE_LABEL + Edge label for edges in 'connect' mode + --edge-prop EDGE_PROP + Property name for storing value when in 'connect' mode + --columns [COLUMNS [COLUMNS ...]] + Rename columns in TSV + --column-include COLUMN_INCLUDE + List subset of columns to use from TSV + --no-vertex Do not load row as vertex + -e EDGE EDGE, --edge EDGE EDGE + Create an edge the connected the current row vertex + args: + --dst-vertex DST_VERTEX DST_VERTEX + Create a destination vertex, args: + + -x EXCLUDE, --exclude EXCLUDE + Exclude row id + -d Run in debug mode. 
Print actions and make no changes + +``` diff --git a/docs/tools/index.md b/docs/tools/index.md new file mode 100644 index 0000000..195f28d --- /dev/null +++ b/docs/tools/index.md @@ -0,0 +1,34 @@ +# CALYPR Tools Ecosystem + +The CALYPR platform provides a suite of powerful, open-source tools designed to handle every stage of the genomic data lifecycle—from ingestion and versioning to distributed analysis and graph-based discovery. + +--- + +### [Git-DRS](git-drs/index.md) +**The Version Control Layer.** +Git-DRS is a specialized extension for Git that manages massive genomic datasets using the GA4GH Data Repository Service (DRS) standard. It allows researchers to track, version, and share petabyte-scale files as easily as code, replacing heavy binaries with lightweight pointer files that resolve to immutable cloud objects. + +### [Funnel](funnel/index.md) +**The Compute Layer.** +Funnel is a distributed task execution engine that implements the GA4GH Task Execution Service (TES) API. It provides a standardized way to run Docker-based analysis pipelines across diverse environments—including Kubernetes, AWS, and Google Cloud—ensuring that your workflows are portable and independent of the underlying infrastructure. + +### [GRIP](grip/index.md) +**The Discovery Layer.** +GRIP (Graph Resource Integration Platform) is a high-performance graph database and query engine designed for complex biological data. It enables analysts to integrate heterogeneous datasets into a unified knowledge graph and perform sophisticated queries that reveal deep relational insights across multi-omic cohorts. + + +--- + +## Choosing the Right Tool + +| If you want to... | Use this tool | +| --- | --- | +| Version and share large genomic files | **Git-DRS** | +| Run batch analysis or Nextflow pipelines | **Funnel** | +| Query complex relationships between datasets | **GRIP** | +| Access Gen3 data from the command line | **Data Client** | + +--- + +!!! tip "Getting Started" + If you are new to the platform, we recommend starting with the [Quick Start Guide](../calypr/quick-start.md) to install the necessary binaries and set up your first workspace. diff --git a/docs/tools/sifter/.nav.yml b/docs/tools/sifter/.nav.yml new file mode 100644 index 0000000..67eeab2 --- /dev/null +++ b/docs/tools/sifter/.nav.yml @@ -0,0 +1,9 @@ +title: Sifter +nav: + - index.md + - docs/example.md + - docs/schema.md + - docs/config.md + - docs/inputs + - docs/transforms + - docs/outputs diff --git a/docs/tools/sifter/assets/sifter_example.png b/docs/tools/sifter/assets/sifter_example.png new file mode 100644 index 0000000..284e0dd Binary files /dev/null and b/docs/tools/sifter/assets/sifter_example.png differ diff --git a/docs/tools/sifter/docs/.nav.yml b/docs/tools/sifter/docs/.nav.yml new file mode 100644 index 0000000..1d7fa65 --- /dev/null +++ b/docs/tools/sifter/docs/.nav.yml @@ -0,0 +1,11 @@ + +title: Sifter Documentation + +nav: + - index.md + - example.md + - schema.md + - config.md + - inputs + - transforms + - outputs \ No newline at end of file diff --git a/docs/tools/sifter/docs/config.md b/docs/tools/sifter/docs/config.md new file mode 100644 index 0000000..38ab63d --- /dev/null +++ b/docs/tools/sifter/docs/config.md @@ -0,0 +1,34 @@ +--- +title: Paramaters +--- + +## Paramaters Variables + +Playbooks can be parameterized. They are defined in the `params` section of the playbook YAML file. 
+ +### Configuration Syntax +```yaml +params: + variableName: + type: File # one of: File, Path, String, Number + default: "path/to/default" +``` + +### Supported Types +- `File`: Represents a file path +- `Dir`: Represents a directory path + +### Example Configuration +```yaml +params: + inputDir: + type: Dir + default: "/data/input" + outputDir: + type: Dir + default: "/data/output" + schemaFile: + type: File + default: "/config/schema.json" +``` + diff --git a/docs/tools/sifter/docs/developers/source_mapping.md b/docs/tools/sifter/docs/developers/source_mapping.md new file mode 100644 index 0000000..335e52f --- /dev/null +++ b/docs/tools/sifter/docs/developers/source_mapping.md @@ -0,0 +1,48 @@ +# SIFTER Project Documentation to Source Code Mapping + +## Inputs + +| Documentation File | Source Code File | +|-------------------|------------------| +| docs/docs/inputs/avro.md | extractors/avro_load.go | +| docs/docs/inputs/embedded.md | extractors/embedded.go | +| docs/docs/inputs/glob.md | extractors/glob_load.go | +| docs/docs/inputs/json.md | extractors/json_load.go | +| docs/docs/inputs/plugin.md | extractors/plugin_load.go | +| docs/docs/inputs/sqldump.md | extractors/sqldump_step.go | +| docs/docs/inputs/sqlite.md | extractors/sqlite_load.go | +| docs/docs/inputs/table.md | extractors/tabular_load.go | +| docs/docs/inputs/xml.md | extractors/xml_step.go | + +## Transforms + +| Documentation File | Source Code File | +|-------------------|------------------| +| docs/docs/transforms/accumulate.md | transform/accumulate.go | +| docs/docs/transforms/clean.md | transform/clean.go | +| docs/docs/transforms/debug.md | transform/debug.go | +| docs/docs/transforms/distinct.md | transform/distinct.go | +| docs/docs/transforms/fieldParse.md | transform/field_parse.go | +| docs/docs/transforms/fieldProcess.md | transform/field_process.go | +| docs/docs/transforms/fieldType.md | transform/field_type.go | +| docs/docs/transforms/filter.md | transform/filter.go | +| docs/docs/transforms/flatmap.md | transform/flat_map.go | +| docs/docs/transforms/from.md | transform/from.go | +| docs/docs/transforms/hash.md | transform/hash.go | +| docs/docs/transforms/lookup.md | transform/lookup.go | +| docs/docs/transforms/map.md | transform/mapping.go | +| docs/docs/transforms/objectValidate.md | transform/object_validate.go | +| docs/docs/transforms/plugin.md | transform/plugin.go | +| docs/docs/transforms/project.md | transform/project.go | +| docs/docs/transforms/reduce.md | transform/reduce.go | +| docs/docs/transforms/regexReplace.md | transform/regex.go | +| docs/docs/transforms/split.md | transform/split.go | +| docs/docs/transforms/uuid.md | transform/uuid.go | + +## Outputs + +| Documentation File | Source Code File | +|-------------------|------------------| +| docs/docs/outputs/graphBuild.md | playbook/output_graph.go | +| docs/docs/outputs/json.md | playbook/output_json.go | +| docs/docs/outputs/tableWrite.md | playbook/output_table.go | \ No newline at end of file diff --git a/docs/tools/sifter/docs/example.md b/docs/tools/sifter/docs/example.md new file mode 100644 index 0000000..3b0e03f --- /dev/null +++ b/docs/tools/sifter/docs/example.md @@ -0,0 +1,196 @@ +--- +render_macros: false +--- + +# Example Pipeline +Our first task will be to convert a ZIP code TSV into a set of county level +entries. 
+ +The input file looks like: + +```csv +ZIP,COUNTYNAME,STATE,STCOUNTYFP,CLASSFP +36003,Autauga County,AL,01001,H1 +36006,Autauga County,AL,01001,H1 +36067,Autauga County,AL,01001,H1 +36066,Autauga County,AL,01001,H1 +36703,Autauga County,AL,01001,H1 +36701,Autauga County,AL,01001,H1 +36091,Autauga County,AL,01001,H1 +``` + +First is the header of the pipeline. This declares the +unique name of the pipeline and its output directory. + +```yaml +name: zipcode_map +outdir: ./ +docs: Converts zipcode TSV into graph elements +``` + +Next, the parameters are declared. In this case the only parameter is the path to the +zipcode TSV. There is a default value, so the pipeline can be invoked without passing in +any parameters. However, to apply this pipeline to a new input file, the +input parameter `zipcode` can be used to point at a different source file. +Path and File parameters may be relative to the directory that contains the playbook file. + +```yaml +params: + schema: + type: path + default: ../covid19_datadictionary/gdcdictionary/schemas/ + zipcode: + type: path + default: ../data/ZIP-COUNTY-FIPS_2017-06.csv +``` + +The `inputs` section declares data input sources. In this pipeline, there is +only one input, which is to run the table loader. +```yaml +inputs: + zipcode: + table: + path: "{{params.zipcode}}" + sep: "," +``` + +The table loader operates on the input file that was passed in via the +`inputs` stanza. SIFTER string parsing is based on the Mustache template system. +To access the string that was passed in, the template is `{{params.zipcode}}`. +The separator in the input file is a `,`, so that is also passed in as a +parameter to the extractor. + + +The `table` extractor opens the TSV and generates one message for +every row in the file. It uses the header of the file to map the column values +into a dictionary. The first row would produce the message: + +```json +{ + "ZIP" : "36003", + "COUNTYNAME" : "Autauga County", + "STATE" : "AL", + "STCOUNTYFP" : "01001", + "CLASSFP" : "H1" +} +``` + +The stream of messages is then passed into the steps of the transform +pipeline that consumes the extractor's output. + +For the current transform, we want to produce a single entry per `STCOUNTYFP`; +however, the file has a line per `ZIP`. We need to run a `reduce` transform +that collects rows together using a field key, which in this case is `"{{row.STCOUNTYFP}}"`, +and then runs a function `merge` that takes two messages, merges them together, +and produces a single output message. + +The two messages: + +```json +{ "ZIP" : "36003", "COUNTYNAME" : "Autauga County", "STATE" : "AL", "STCOUNTYFP" : "01001", "CLASSFP" : "H1"} +{ "ZIP" : "36006", "COUNTYNAME" : "Autauga County", "STATE" : "AL", "STCOUNTYFP" : "01001", "CLASSFP" : "H1"} +``` + +Would be merged into the message: + +```json +{ "ZIP" : ["36003", "36006"], "COUNTYNAME" : "Autauga County", "STATE" : "AL", "STCOUNTYFP" : "01001", "CLASSFP" : "H1"} +``` + +The `reduce` transform step uses a block of Python code to describe the function. +The `method` field names the function, in this case `merge`, which will be used +as the reduce function.
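Before looking at the pipeline step itself, here is a minimal standalone sketch of that reduction, using plain Python and `functools.reduce` rather than Sifter's own runtime; the body of `merge` is copied from the playbook step shown next, and the two rows are the example messages above.

```python
from functools import reduce

# The two row messages produced by the table extractor (copied from the example above).
rows = [
    {"ZIP": "36003", "COUNTYNAME": "Autauga County", "STATE": "AL", "STCOUNTYFP": "01001", "CLASSFP": "H1"},
    {"ZIP": "36006", "COUNTYNAME": "Autauga County", "STATE": "AL", "STCOUNTYFP": "01001", "CLASSFP": "H1"},
]

def merge(x, y):
    # Same body as the `merge` method in the zipReduce step below:
    # accumulate each message's ZIP value into a "zipcodes" list on the surviving message.
    a = x.get("zipcodes", []) + [x["ZIP"]]
    b = y.get("zipcodes", []) + [y["ZIP"]]
    x["zipcodes"] = a + b
    return x

merged = reduce(merge, rows)
print(merged["STCOUNTYFP"])  # 01001
print(merged["zipcodes"])    # ['36003', '36006']
```

In the real pipeline the grouping key (`STCOUNTYFP`) decides which messages get reduced together; the function itself only has to combine two messages at a time.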
+ +```yaml + zipReduce: + - from: zipcode + - reduce: + field: STCOUNTYFP + method: merge + python: > + def merge(x,y): + a = x.get('zipcodes', []) + [x['ZIP']] + b = y.get('zipcodes', []) + [y['ZIP']] + x['zipcodes'] = a + b + return x +``` + +The original messages produced by the loader have all of the information required +by the `summary_location` object type as described by the JSON schema that is referenced +in the `params` stanza. However, the data is all under the wrong field names. +To remap the data, we use a `project` transformation that uses the template engine +to project data into new fields in the message. The template engine has the current +message data in the value `row`. So the value +`FIPS:{{row.STCOUNTYFP}}` is mapped into the field `id`. + +```yaml + - project: + mapping: + id: "FIPS:{{row.STCOUNTYFP}}" + province_state: "{{row.STATE}}" + summary_locations: "{{row.STCOUNTYFP}}" + county: "{{row.COUNTYNAME}}" + submitter_id: "{{row.STCOUNTYFP}}" + type: summary_location + projects: [] +``` + +Using this projection, the message: + +```json +{ + "ZIP" : ["36003", "36006"], + "COUNTYNAME" : "Autauga County", + "STATE" : "AL", + "STCOUNTYFP" : "01001", + "CLASSFP" : "H1" +} +``` + +would become + +```json +{ + "id" : "FIPS:01001", + "province_state" : "AL", + "summary_locations" : "01001", + "county" : "Autauga County", + "submitter_id" : "01001", + "type" : "summary_location", + "projects" : [], + "ZIP" : ["36003", "36006"], + "COUNTYNAME" : "Autauga County", + "STATE" : "AL", + "STCOUNTYFP" : "01001", + "CLASSFP" : "H1" +} +``` + +Now that the data has been remapped, we pass it into the `objectValidate` +step, which will open the schema directory, find the class titled `summary_location`, check that the +message matches, and then output it. + +```yaml + - objectValidate: + title: summary_location + schema: "{{params.schema}}" +``` + + +## Outputs + +Finally, we create an output table with two columns connecting +`ZIP` values to `STCOUNTYFP` values. `STCOUNTYFP` is a county-level FIPS +code used by the census office. A single FIPS code may contain many ZIP codes, +and we can use this table later for mapping IDs when loading the data into a database. + +```yaml +outputs: + zip2fips: + tableWrite: + from: zipReduce + path: zip2fips.tsv + columns: + - ZIP + - STCOUNTYFP +``` diff --git a/docs/tools/sifter/docs/index.md b/docs/tools/sifter/docs/index.md new file mode 100644 index 0000000..56e1b1d --- /dev/null +++ b/docs/tools/sifter/docs/index.md @@ -0,0 +1,194 @@ +--- +title: Sifter +render_macros: false +--- + + +# Sifter + +Sifter is a stream-based processing engine. It comes with a number of +file extractors that act as inputs to its pipelines. The pipeline engine +connects together several processing steps into a directed acyclic graph that is processed +in parallel. + +Example Message: + +```json +{ + "firstName" : "bob", + "age" : "25", + "friends" : [ "Max", "Alex"] +} +``` + +Once a stream of messages is produced, it can be run through a transform +pipeline. A transform pipeline is an array of transform steps; each transform +step can represent a different way to alter the data. The array of transforms links +together into a pipe that makes multiple alterations to messages as they are +passed along.
There are a number of different transform step types that can +be used in a transform pipeline. These include: + + - Projection: creating new fields using a templating engine driven by existing values + - Filtering: removing messages + - Programmatic transformation: altering messages using an embedded Python interpreter + - Table-based field translation + - Outputting the message as a JSON Schema checked object + + +# Script structure + +# Pipeline File + +A Sifter pipeline file is in YAML format and describes an entire processing pipeline. +It is composed of the following sections: `params`, `inputs`, `pipelines`, `outputs`. In addition, +for tracking, the file will also include `name` and `class` entries. + +```yaml + +class: sifter +name: CALYPR diff --git a/overrides/home.html b/overrides/home.html new file mode 100644 index 0000000..923ddf0 --- /dev/null +++ b/overrides/home.html @@ -0,0 +1,23 @@ +{% extends "main.html" %} + +{% block tabs %} +{{ super() }} +
+
+
+

CALYPR

+
Unlocking biological insights with scalable, cloud/on-prem hybrid infrastructure. +
+
+
+
+{% endblock %} + + +{% block content %} +{{ super() }} +{% endblock %} + +{% block footer %} +{{ super() }} +{% endblock %} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 7f415fe..df39fe1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ mkdocs-material mkdocs-macros-plugin mkdocs-open-in-new-tab mkdocs-linkcheck - +mkdocs-awesome-nav +termynal \ No newline at end of file