diff --git a/Project.toml b/Project.toml index 50e4b3f..0605d15 100644 --- a/Project.toml +++ b/Project.toml @@ -18,6 +18,7 @@ Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59" IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953" JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MUSCLE_jll = "0d2b832c-22e5-5080-b34f-49cc6141c4c5" MultivariateStats = "6f286f6a-111f-5878-ab1e-185364afe411" MutableConvexHulls = "948c7aac-0e5e-4631-af23-7a6bb7a17825" OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" @@ -47,7 +48,7 @@ Downloads = "1" FASTX = "2" FixedPointNumbers = "0.8" GZip = "0.6" -GaussianMixtureAlignment = "0.2.2, 0.3" +GaussianMixtureAlignment = "0.2.2, 0.3, 0.4" HTTP = "1" HiGHS = "1" Hungarian = "0.6, 0.7" @@ -57,6 +58,7 @@ JSON3 = "1" JuMP = "1" LinearAlgebra = "1" MIToS = "3" +MUSCLE_jll = "5.2.0" MultivariateStats = "0.9, 0.10" MutableConvexHulls = "0.2" OffsetArrays = "1" diff --git a/src/GPCRAnalysis.jl b/src/GPCRAnalysis.jl index 29e103d..9ccf461 100644 --- a/src/GPCRAnalysis.jl +++ b/src/GPCRAnalysis.jl @@ -31,6 +31,9 @@ using GZip using FixedPointNumbers using ColorTypes +# MUSCLE +using MUSCLE_jll + const ChainLike = Union{Chain, AbstractVector{<:AbstractResidue}} # an iterable of residues const ResidueLike = Union{Residue, AbstractVector{<:AbstractAtom}} # an iterable of atoms const StructureLike = Union{ChainLike, Model, MolecularStructure} @@ -39,10 +42,10 @@ const StructureLike = Union{ChainLike, Model, MolecularStructure} export SequenceMapping, AccessionCode, MSACode, NWGapCosts export sequenceindexes, columnindexes, isgap, isunknown, sequencekeys, msasequence, residuematrix, subseqs, subseqs! -export species, uniprotX, query_uniprot_accession, query_ebi_proteins, query_ncbi +export species, uniprotX, query_uniprot_accession, query_ebi_proteins, query_ncbi, query_tm_count export try_download_alphafold, query_alphafold_latest, download_alphafolds, alphafoldfile, alphafoldfiles, getchain, writechain, findall_subseq, pLDDT, pLDDTcolor -export align_to_axes, align_to_membrane, align_nw, align_ranges, map_closest, align_closest +export align_to_axes, align_to_membrane, align_nw, align_ranges, map_closest, align_closest, align_muscle, align_family export filter_species!, filter_long!, sortperm_msa, chimerax_script export project_sequences, columnwise_entropy, align, residue_centroid, residue_centroid_matrix, alphacarbon_coordinates, alphacarbon_coordinates_matrix, chargelocations, positive_locations, negative_locations diff --git a/src/align.jl b/src/align.jl index 0b96a4e..6b21deb 100644 --- a/src/align.jl +++ b/src/align.jl @@ -82,9 +82,8 @@ end Compute the rigid transformation `tform` needed to align `chain` to the membrane, given the transmembrane segments `tms` as residue-indexes (e.g., -`[37:61, 74:96, ...]`). `extracelluar` should be true if the N-terminus of the -protein is extracellular (`chain[first(tms[1])]` is at the extracellular face), -and false otherwise. +`[37:61, 74:96, ...]`). `extracellular` should be true if the N-terminus of the +chain is extracellular, and false otherwise. `applytransform!(chain, tform)` (or the `model` that includes `chain`) will re-orient `chain` so that the center of the membrane is `z=0` and extracellular @@ -169,6 +168,169 @@ function collect_leaflet_residues(chain, tms, extracellular::Bool) return leaflet_e, leaflet_i end +## Alignment using MUSCLE +# https://github.com/rcedgar/muscle + +""" + align_muscle(infile::AbstractString, outfile::AbstractString) + +Run [MUSCLE](https://github.com/rcedgar/muscle) to align the sequences in +`infile` and write the aligned sequences to `outfile`. Both files should be in +FASTA format. + +# Examples + +Let's get the sequences of two receptors and then align them with MUSCLE: + +``` +julia> open("infile.fasta", "w") do io + write(io, query_ebi_proteins(["Q8R256", "E9Q8I6"]; format=:fasta)) +end + +julia> align_muscle("infile.fasta", "outfile.fasta") +``` +""" +align_muscle(infile::AbstractString, outfile::AbstractString) = run(`$(muscle()) -align $infile -output $outfile`) + +""" + msa = align_family(; ids=nothing, msafile, srcdir, destdir, id_for_tms, + conserved_params=AlignFamilyParams()) + +Download and structurally align AlphaFold models for a family of receptors. + +# Arguments +- `msafile`: path to the FASTA MSA file. This anchors the workflow: if the file does not + exist yet, `ids` must be supplied to create it; if it already exists (e.g. when + resuming an interrupted run), it is read directly and `ids` is not needed. +- `ids`: a vector of UniProt accession codes, required only to create `msafile`. + Sequences are fetched from EBI in batches of 50 and aligned with MUSCLE. +- `srcdir`: directory for downloaded AlphaFold structure files. +- `destdir`: directory for the aligned output CIF files. +- `id_for_tms`: UniProt accession of the reference receptor. Must appear in the MSA and + have an AlphaFold structure. Its TRANSMEM annotations from EBI are used to orient all + structures to the membrane normal. Use [`query_tm_count`](@ref) to find an accession + with a complete TM annotation. +- `conserved_params`: an [`AlignFamilyParams`](@ref) controlling which MSA columns serve + as alignment anchors. + +# Returns +The MSA as a `Vector{FASTX.FASTA.Record}`. + +# Steps +1. Create the MSA from `ids` if `msafile` does not already exist. +2. Fetch TRANSMEM annotations for `id_for_tms` from EBI. +3. Download AlphaFold structures for all MSA sequences into `srcdir`. +4. Identify conserved, low-gap MSA columns as structural alignment anchors. +5. Align the reference structure to the membrane normal. +6. Align every structure to the reference using α-carbon positions at the conserved + columns, and write the results to `destdir`. +""" +function align_family(; + ids=nothing, + msafile, + srcdir, + destdir, + id_for_tms, + conserved_params=AlignFamilyParams(), + ) + # Step 1: create MSA from ids if msafile doesn't exist yet + if !isfile(msafile) + ids !== nothing || throw(ArgumentError( + "`msafile` \"$msafile\" does not exist; supply `ids` to create it" + )) + mktemp() do seqfile, seqio + for i in firstindex(ids):50:lastindex(ids) + result = query_ebi_proteins(ids[i:min(i+49, lastindex(ids))]; format=:fasta) + result === nothing && continue + write(seqio, result) + end + close(seqio) + align_muscle(seqfile, msafile) + end + end + + msa = FASTAReader(open(msafile)) do io + collect(io) + end + + # Step 2: fetch TM segment annotations for the reference + ref_entry = query_ebi_proteins(id_for_tms) + (ref_entry === nothing || isempty(ref_entry)) && error("EBI query failed for $id_for_tms") + reftms = [parse(Int, f["begin"]):parse(Int, f["end"]) + for f in only(ref_entry).features + if f["type"] == "TRANSMEM"] + isempty(reftms) && error("No TRANSMEM features found for $id_for_tms; try a different `id_for_tms`") + length(reftms) == 7 || error("Need 7 TM segments for $id_for_tms, but found $(length(reftms)); try a different `id_for_tms`") + + # Step 3: download AlphaFold structures + isdir(srcdir) || mkdir(srcdir) + download_alphafolds(msa; dirname=srcdir) + + # Step 4: identify conserved, low-gap MSA columns as alignment anchors + # columnindexes filters out all-gap columns; map conserved back to full-sequence positions + colidxs = columnindexes(msa) + resmtrx = residuematrix(msa) # columns correspond to colidxs + colentropy = columnwise_entropy(msa) + gapfrac = vec(mean(resmtrx .== '-', dims=1)) + ce = [gapfrac[j] > conserved_params.gapfrac_max ? Inf : colentropy[j] + for j in eachindex(colentropy)] + conserved = colidxs[findall(ce .< conserved_params.maxentropy)] + conserved_set = Set(conserved) + + # Find the reference sequence in the MSA + msaidx = Dict(uniprotX(identifier(msa[i])) => i for i in eachindex(msa)) + refseqidx = get(msaidx, id_for_tms, nothing) + refseqidx === nothing && error("$id_for_tms not found in MSA ($msafile)") + + # Step 5: align reference structure to membrane + reffile = alphafoldfile(id_for_tms, srcdir; join=true) + reffile === nothing && error("AlphaFold structure for $id_for_tms not found in $srcdir") + refchain = getchain(reffile) + applytransform!(refchain, align_to_membrane(refchain, reftms)) + + # Extract α-carbon coordinates at the conserved MSA columns. + # `msaseq` is the full aligned sequence (gaps as '-'); `chainidx` counts non-gap + # residues so that residues[chainidx] matches the current MSA position. + sentinel = fill(NaN, 3) + function alphacoords_at(chain, msaseq) + residues = collectresidues(chain) + coords = zeros(3, length(conserved)) + chainidx = outidx = 0 + for (i, r) in enumerate(msaseq) + r != '-' && (chainidx += 1) + if i in conserved_set + coords[:, outidx += 1] = r == '-' ? sentinel : alphacarbon_coordinates(residues[chainidx]) + end + end + return coords + end + + refcoords = alphacoords_at(refchain, msasequence(msa, refseqidx)) + any(isnan, refcoords) && error( + "$id_for_tms has gaps at conserved MSA columns; " * + "choose a different `id_for_tms` or adjust `conserved_params`" + ) + + # Step 6: align all structures to the reference and write to destdir + isdir(destdir) || mkdir(destdir) + for i in eachindex(msa) + name = uniprotX(identifier(msa[i])) + structfile = alphafoldfile(name, srcdir; join=true) + structfile === nothing && continue + c = getchain(structfile) + ccoords = alphacoords_at(c, msasequence(msa, i)) + keep = vec(.!any(isnan.(ccoords); dims=1)) + if !any(keep) + @warn "No conserved-column coordinates available for $name; skipping" + continue + end + applytransform!(c, Transformation(ccoords[:, keep], refcoords[:, keep])) + writechain(joinpath(destdir, "$name.cif"), c) + end + + return msa +end + ## Needleman-Wunsch alignment # This implements sequence alignment based on three-dimensional structure, where diff --git a/src/msa.jl b/src/msa.jl index 9d97aaa..862cabc 100644 --- a/src/msa.jl +++ b/src/msa.jl @@ -271,3 +271,19 @@ function Base.setindex!(fullseqvec::Vector, colvalues::AbstractVector, sm::Seque end return fullseqvec end + +""" + AlignFamilyParams(; gapfrac_max=0.2, maxentropy=0.2) + +Parameters controlling which MSA columns are used as structural alignment anchors in +[`align_family`](@ref). + +- `gapfrac_max`: columns where more than this fraction of sequences have a gap are + excluded (default 0.2). +- `maxentropy`: columns with entropy above this threshold are excluded (default 0.2). + Lower entropy means higher conservation. +""" +@kwdef struct AlignFamilyParams + gapfrac_max::Float64 = 0.2 + maxentropy::Float64 = 0.2 +end diff --git a/src/query.jl b/src/query.jl index 7e932fb..91790c1 100644 --- a/src/query.jl +++ b/src/query.jl @@ -16,7 +16,7 @@ You can also supply several proteins as a comma-separated list. `result` is a JSON3 object with many fields. """ function query_ebi_proteins(id::AbstractString; size=count(==(','), id)+1, format::Symbol=:json) - resp = HTTP.get("https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=$size&accession=$id", ["Accept" => formats[format]]) + resp = HTTP.get("https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=$size&accession=$id", ["Accept" => formats[format]]; status_exception=false) if resp.status == 200 return mktemp() do tmppath, tmpio body = String(resp.body) @@ -33,6 +33,37 @@ end query_ebi_proteins(id::AbstractVector{<:AbstractString}; kwargs...) = query_ebi_proteins(join(id, ','); kwargs...) +""" + ntms = query_tm_count(id) + ntms = query_tm_count(ids) + +Query the EBI Proteins API for the number of annotated transmembrane (TRANSMEM) segments. + +The single-accession form returns an `Int` (or `nothing` if the entry is not found). The +vector form returns a `Dict{String,Int}` mapping each returned accession code to its count. +Queries are batched at 50 IDs per request. + +Useful for choosing an `id_for_tms` for [`align_family`](@ref): a good reference receptor +should have its TM segments fully annotated (e.g., 7 for a canonical GPCR). +""" +function query_tm_count(ids::AbstractVector{<:AbstractString}; batchsize=50) + result = Dict{String,Int}() + for i in firstindex(ids):batchsize:lastindex(ids) + batch = ids[i:min(i+batchsize-1, lastindex(ids))] + response = query_ebi_proteins(batch) + response === nothing && continue + for r in response + result[String(r["accession"])] = count(f -> f["type"] == "TRANSMEM", r["features"]) + end + end + return result +end + +function query_tm_count(id::AbstractString) + result = query_tm_count([id]) + return get(result, id, nothing) +end + """ result = query_ncbi(id) diff --git a/src/utils.jl b/src/utils.jl index 5abb0b9..dd9996d 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -40,6 +40,23 @@ function validate_seq_residues(msaseq, chain) return true end +function validate_seq_residues(msaseq::AbstractString, chain) + chainiter = iterate(chain) + item, state = chainiter + for r in msaseq + (isgap(r) || isunknown(r)) && continue + res = three2char(String(resname(item))) + res == Char(r) || return false + chainiter = iterate(chain, state) + if chainiter !== nothing + item, state = chainiter + else + state = nothing # throws if this gets used again + end + end + return chainiter === nothing +end + function countitems(list) count = Dict{eltype(list),Int}() for item in list diff --git a/test/runtests.jl b/test/runtests.jl index 8cac0d4..8375643 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -423,6 +423,19 @@ using Test @test q["total_count"] == 1 end + @testset "TM counts" begin + # P29274 (ADORA2A_HUMAN) and P29275 (ADORA2B_HUMAN) are canonical GPCRs + # with 7 annotated TM segments in UniProt + ntms = query_tm_count(["P29274", "P29275"]) + @test get(ntms, "P29274", 0) == 7 + @test get(ntms, "P29275", 0) == 7 + + # Single-ID form returns an Int + @test query_tm_count("P29274") == 7 + @test query_tm_count("P02769") == 0 # BSA — cytosolic, no TMs + @test query_tm_count("notanid12345") === nothing # nonexistent ID + end + @testset "AlphaFold" begin @test try_download_alphafold("garbage") === nothing uname = "K7N608" @@ -537,5 +550,25 @@ using Test @test occursin("matchmaker #2-2 to #1", script) end end + @testset "align_family" begin + # Use two related human adenosine receptors (same subfamily, well annotated) + ids = ["P29274", "P29275"] + mktempdir() do dir + msafile = joinpath(dir, "msa.fasta") + srcdir = joinpath(dir, "src") + destdir = joinpath(dir, "dest") + msa = align_family(; ids, msafile, srcdir, destdir, id_for_tms="P29274") + @test length(msa) == 2 + @test isfile(msafile) # MSA was written + @test isfile(joinpath(destdir, "P29274.cif")) + @test isfile(joinpath(destdir, "P29275.cif")) + # Output files should be valid CIF structures + @test getchain(joinpath(destdir, "P29274.cif")) isa GPCRAnalysis.ChainLike + @test getchain(joinpath(destdir, "P29275.cif")) isa GPCRAnalysis.ChainLike + # Re-running without ids (msafile exists) should complete without errors + msa2 = align_family(; msafile, srcdir, destdir, id_for_tms="P29274") + @test length(msa2) == 2 + end + end end end