Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953"
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MUSCLE_jll = "0d2b832c-22e5-5080-b34f-49cc6141c4c5"
MultivariateStats = "6f286f6a-111f-5878-ab1e-185364afe411"
MutableConvexHulls = "948c7aac-0e5e-4631-af23-7a6bb7a17825"
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
Expand Down Expand Up @@ -47,7 +48,7 @@ Downloads = "1"
FASTX = "2"
FixedPointNumbers = "0.8"
GZip = "0.6"
GaussianMixtureAlignment = "0.2.2, 0.3"
GaussianMixtureAlignment = "0.2.2, 0.3, 0.4"
HTTP = "1"
HiGHS = "1"
Hungarian = "0.6, 0.7"
Expand All @@ -57,6 +58,7 @@ JSON3 = "1"
JuMP = "1"
LinearAlgebra = "1"
MIToS = "3"
MUSCLE_jll = "5.2.0"
MultivariateStats = "0.9, 0.10"
MutableConvexHulls = "0.2"
OffsetArrays = "1"
Expand Down
7 changes: 5 additions & 2 deletions src/GPCRAnalysis.jl
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ using GZip
using FixedPointNumbers
using ColorTypes

# MUSCLE
using MUSCLE_jll

const ChainLike = Union{Chain, AbstractVector{<:AbstractResidue}} # an iterable of residues
const ResidueLike = Union{Residue, AbstractVector{<:AbstractAtom}} # an iterable of atoms
const StructureLike = Union{ChainLike, Model, MolecularStructure}
Expand All @@ -39,10 +42,10 @@ const StructureLike = Union{ChainLike, Model, MolecularStructure}

export SequenceMapping, AccessionCode, MSACode, NWGapCosts
export sequenceindexes, columnindexes, isgap, isunknown, sequencekeys, msasequence, residuematrix, subseqs, subseqs!
export species, uniprotX, query_uniprot_accession, query_ebi_proteins, query_ncbi
export species, uniprotX, query_uniprot_accession, query_ebi_proteins, query_ncbi, query_tm_count
export try_download_alphafold, query_alphafold_latest, download_alphafolds, alphafoldfile, alphafoldfiles, getchain,
writechain, findall_subseq, pLDDT, pLDDTcolor
export align_to_axes, align_to_membrane, align_nw, align_ranges, map_closest, align_closest
export align_to_axes, align_to_membrane, align_nw, align_ranges, map_closest, align_closest, align_muscle, align_family
export filter_species!, filter_long!, sortperm_msa, chimerax_script
export project_sequences, columnwise_entropy, align, residue_centroid, residue_centroid_matrix, alphacarbon_coordinates,
alphacarbon_coordinates_matrix, chargelocations, positive_locations, negative_locations
Expand Down
168 changes: 165 additions & 3 deletions src/align.jl
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,8 @@ end

Compute the rigid transformation `tform` needed to align `chain` to the
membrane, given the transmembrane segments `tms` as residue-indexes (e.g.,
`[37:61, 74:96, ...]`). `extracelluar` should be true if the N-terminus of the
protein is extracellular (`chain[first(tms[1])]` is at the extracellular face),
and false otherwise.
`[37:61, 74:96, ...]`). `extracellular` should be true if the N-terminus of the
chain is extracellular, and false otherwise.

`applytransform!(chain, tform)` (or the `model` that includes `chain`) will
re-orient `chain` so that the center of the membrane is `z=0` and extracellular
Expand Down Expand Up @@ -169,6 +168,169 @@ function collect_leaflet_residues(chain, tms, extracellular::Bool)
return leaflet_e, leaflet_i
end

## Alignment using MUSCLE
# https://github.com/rcedgar/muscle

"""
align_muscle(infile::AbstractString, outfile::AbstractString)

Run [MUSCLE](https://github.com/rcedgar/muscle) to align the sequences in
`infile` and write the aligned sequences to `outfile`. Both files should be in
FASTA format.

# Examples

Let's get the sequences of two receptors and then align them with MUSCLE:

```
julia> open("infile.fasta", "w") do io
write(io, query_ebi_proteins(["Q8R256", "E9Q8I6"]; format=:fasta))
end

julia> align_muscle("infile.fasta", "outfile.fasta")
```
"""
align_muscle(infile::AbstractString, outfile::AbstractString) = run(`$(muscle()) -align $infile -output $outfile`)

"""
msa = align_family(; ids=nothing, msafile, srcdir, destdir, id_for_tms,
conserved_params=AlignFamilyParams())

Download and structurally align AlphaFold models for a family of receptors.

# Arguments
- `msafile`: path to the FASTA MSA file. This anchors the workflow: if the file does not
exist yet, `ids` must be supplied to create it; if it already exists (e.g. when
resuming an interrupted run), it is read directly and `ids` is not needed.
- `ids`: a vector of UniProt accession codes, required only to create `msafile`.
Sequences are fetched from EBI in batches of 50 and aligned with MUSCLE.
- `srcdir`: directory for downloaded AlphaFold structure files.
- `destdir`: directory for the aligned output CIF files.
- `id_for_tms`: UniProt accession of the reference receptor. Must appear in the MSA and
have an AlphaFold structure. Its TRANSMEM annotations from EBI are used to orient all
structures to the membrane normal. Use [`query_tm_count`](@ref) to find an accession
with a complete TM annotation.
- `conserved_params`: an [`AlignFamilyParams`](@ref) controlling which MSA columns serve
as alignment anchors.

# Returns
The MSA as a `Vector{FASTX.FASTA.Record}`.

# Steps
1. Create the MSA from `ids` if `msafile` does not already exist.
2. Fetch TRANSMEM annotations for `id_for_tms` from EBI.
3. Download AlphaFold structures for all MSA sequences into `srcdir`.
4. Identify conserved, low-gap MSA columns as structural alignment anchors.
5. Align the reference structure to the membrane normal.
6. Align every structure to the reference using α-carbon positions at the conserved
columns, and write the results to `destdir`.
"""
function align_family(;
ids=nothing,
msafile,
srcdir,
destdir,
id_for_tms,
conserved_params=AlignFamilyParams(),
)
# Step 1: create MSA from ids if msafile doesn't exist yet
if !isfile(msafile)
ids !== nothing || throw(ArgumentError(
"`msafile` \"$msafile\" does not exist; supply `ids` to create it"
))
mktemp() do seqfile, seqio
for i in firstindex(ids):50:lastindex(ids)
result = query_ebi_proteins(ids[i:min(i+49, lastindex(ids))]; format=:fasta)
result === nothing && continue
write(seqio, result)
end
close(seqio)
align_muscle(seqfile, msafile)
end
end

msa = FASTAReader(open(msafile)) do io
collect(io)
end

# Step 2: fetch TM segment annotations for the reference
ref_entry = query_ebi_proteins(id_for_tms)
(ref_entry === nothing || isempty(ref_entry)) && error("EBI query failed for $id_for_tms")
reftms = [parse(Int, f["begin"]):parse(Int, f["end"])
for f in only(ref_entry).features
if f["type"] == "TRANSMEM"]
isempty(reftms) && error("No TRANSMEM features found for $id_for_tms; try a different `id_for_tms`")
length(reftms) == 7 || error("Need 7 TM segments for $id_for_tms, but found $(length(reftms)); try a different `id_for_tms`")

# Step 3: download AlphaFold structures
isdir(srcdir) || mkdir(srcdir)
download_alphafolds(msa; dirname=srcdir)

# Step 4: identify conserved, low-gap MSA columns as alignment anchors
# columnindexes filters out all-gap columns; map conserved back to full-sequence positions
colidxs = columnindexes(msa)
resmtrx = residuematrix(msa) # columns correspond to colidxs
colentropy = columnwise_entropy(msa)
gapfrac = vec(mean(resmtrx .== '-', dims=1))
ce = [gapfrac[j] > conserved_params.gapfrac_max ? Inf : colentropy[j]
for j in eachindex(colentropy)]
conserved = colidxs[findall(ce .< conserved_params.maxentropy)]
conserved_set = Set(conserved)

# Find the reference sequence in the MSA
msaidx = Dict(uniprotX(identifier(msa[i])) => i for i in eachindex(msa))
refseqidx = get(msaidx, id_for_tms, nothing)
refseqidx === nothing && error("$id_for_tms not found in MSA ($msafile)")

# Step 5: align reference structure to membrane
reffile = alphafoldfile(id_for_tms, srcdir; join=true)
reffile === nothing && error("AlphaFold structure for $id_for_tms not found in $srcdir")
refchain = getchain(reffile)
applytransform!(refchain, align_to_membrane(refchain, reftms))

# Extract α-carbon coordinates at the conserved MSA columns.
# `msaseq` is the full aligned sequence (gaps as '-'); `chainidx` counts non-gap
# residues so that residues[chainidx] matches the current MSA position.
sentinel = fill(NaN, 3)
function alphacoords_at(chain, msaseq)
residues = collectresidues(chain)
coords = zeros(3, length(conserved))
chainidx = outidx = 0
for (i, r) in enumerate(msaseq)
r != '-' && (chainidx += 1)
if i in conserved_set
coords[:, outidx += 1] = r == '-' ? sentinel : alphacarbon_coordinates(residues[chainidx])
end
end
return coords
end

refcoords = alphacoords_at(refchain, msasequence(msa, refseqidx))
any(isnan, refcoords) && error(
"$id_for_tms has gaps at conserved MSA columns; " *
"choose a different `id_for_tms` or adjust `conserved_params`"
)

# Step 6: align all structures to the reference and write to destdir
isdir(destdir) || mkdir(destdir)
for i in eachindex(msa)
name = uniprotX(identifier(msa[i]))
structfile = alphafoldfile(name, srcdir; join=true)
structfile === nothing && continue
c = getchain(structfile)
ccoords = alphacoords_at(c, msasequence(msa, i))
keep = vec(.!any(isnan.(ccoords); dims=1))
if !any(keep)
@warn "No conserved-column coordinates available for $name; skipping"
continue
end
applytransform!(c, Transformation(ccoords[:, keep], refcoords[:, keep]))
writechain(joinpath(destdir, "$name.cif"), c)
end

return msa
end

## Needleman-Wunsch alignment

# This implements sequence alignment based on three-dimensional structure, where
Expand Down
16 changes: 16 additions & 0 deletions src/msa.jl
Original file line number Diff line number Diff line change
Expand Up @@ -271,3 +271,19 @@ function Base.setindex!(fullseqvec::Vector, colvalues::AbstractVector, sm::Seque
end
return fullseqvec
end

"""
AlignFamilyParams(; gapfrac_max=0.2, maxentropy=0.2)

Parameters controlling which MSA columns are used as structural alignment anchors in
[`align_family`](@ref).

- `gapfrac_max`: columns where more than this fraction of sequences have a gap are
excluded (default 0.2).
- `maxentropy`: columns with entropy above this threshold are excluded (default 0.2).
Lower entropy means higher conservation.
"""
@kwdef struct AlignFamilyParams
gapfrac_max::Float64 = 0.2
maxentropy::Float64 = 0.2
end
33 changes: 32 additions & 1 deletion src/query.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ You can also supply several proteins as a comma-separated list.
`result` is a JSON3 object with many fields.
"""
function query_ebi_proteins(id::AbstractString; size=count(==(','), id)+1, format::Symbol=:json)
resp = HTTP.get("https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=$size&accession=$id", ["Accept" => formats[format]])
resp = HTTP.get("https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=$size&accession=$id", ["Accept" => formats[format]]; status_exception=false)
if resp.status == 200
return mktemp() do tmppath, tmpio
body = String(resp.body)
Expand All @@ -33,6 +33,37 @@ end
query_ebi_proteins(id::AbstractVector{<:AbstractString}; kwargs...) =
query_ebi_proteins(join(id, ','); kwargs...)

"""
ntms = query_tm_count(id)
ntms = query_tm_count(ids)

Query the EBI Proteins API for the number of annotated transmembrane (TRANSMEM) segments.

The single-accession form returns an `Int` (or `nothing` if the entry is not found). The
vector form returns a `Dict{String,Int}` mapping each returned accession code to its count.
Queries are batched at 50 IDs per request.

Useful for choosing an `id_for_tms` for [`align_family`](@ref): a good reference receptor
should have its TM segments fully annotated (e.g., 7 for a canonical GPCR).
"""
function query_tm_count(ids::AbstractVector{<:AbstractString}; batchsize=50)
result = Dict{String,Int}()
for i in firstindex(ids):batchsize:lastindex(ids)
batch = ids[i:min(i+batchsize-1, lastindex(ids))]
response = query_ebi_proteins(batch)
response === nothing && continue
for r in response
result[String(r["accession"])] = count(f -> f["type"] == "TRANSMEM", r["features"])
end
end
return result
end

function query_tm_count(id::AbstractString)
result = query_tm_count([id])
return get(result, id, nothing)
end

"""
result = query_ncbi(id)

Expand Down
17 changes: 17 additions & 0 deletions src/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,23 @@ function validate_seq_residues(msaseq, chain)
return true
end

function validate_seq_residues(msaseq::AbstractString, chain)
chainiter = iterate(chain)
item, state = chainiter
for r in msaseq
(isgap(r) || isunknown(r)) && continue
res = three2char(String(resname(item)))
res == Char(r) || return false
chainiter = iterate(chain, state)
if chainiter !== nothing
item, state = chainiter
else
state = nothing # throws if this gets used again
end
end
return chainiter === nothing
end

function countitems(list)
count = Dict{eltype(list),Int}()
for item in list
Expand Down
33 changes: 33 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,19 @@ using Test
@test q["total_count"] == 1
end

@testset "TM counts" begin
# P29274 (ADORA2A_HUMAN) and P29275 (ADORA2B_HUMAN) are canonical GPCRs
# with 7 annotated TM segments in UniProt
ntms = query_tm_count(["P29274", "P29275"])
@test get(ntms, "P29274", 0) == 7
@test get(ntms, "P29275", 0) == 7

# Single-ID form returns an Int
@test query_tm_count("P29274") == 7
@test query_tm_count("P02769") == 0 # BSA — cytosolic, no TMs
@test query_tm_count("notanid12345") === nothing # nonexistent ID
end

@testset "AlphaFold" begin
@test try_download_alphafold("garbage") === nothing
uname = "K7N608"
Expand Down Expand Up @@ -537,5 +550,25 @@ using Test
@test occursin("matchmaker #2-2 to #1", script)
end
end
@testset "align_family" begin
# Use two related human adenosine receptors (same subfamily, well annotated)
ids = ["P29274", "P29275"]
mktempdir() do dir
msafile = joinpath(dir, "msa.fasta")
srcdir = joinpath(dir, "src")
destdir = joinpath(dir, "dest")
msa = align_family(; ids, msafile, srcdir, destdir, id_for_tms="P29274")
@test length(msa) == 2
@test isfile(msafile) # MSA was written
@test isfile(joinpath(destdir, "P29274.cif"))
@test isfile(joinpath(destdir, "P29275.cif"))
# Output files should be valid CIF structures
@test getchain(joinpath(destdir, "P29274.cif")) isa GPCRAnalysis.ChainLike
@test getchain(joinpath(destdir, "P29275.cif")) isa GPCRAnalysis.ChainLike
# Re-running without ids (msafile exists) should complete without errors
msa2 = align_family(; msafile, srcdir, destdir, id_for_tms="P29274")
@test length(msa2) == 2
end
end
end
end
Loading