HolyLab · timholy · Mar 18, 2026 · Mar 18, 2026
diff --git a/Project.toml b/Project.toml
@@ -18,6 +18,7 @@ Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
 IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953"
 JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+MUSCLE_jll = "0d2b832c-22e5-5080-b34f-49cc6141c4c5"
 MultivariateStats = "6f286f6a-111f-5878-ab1e-185364afe411"
 MutableConvexHulls = "948c7aac-0e5e-4631-af23-7a6bb7a17825"
 OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
@@ -47,7 +48,7 @@ Downloads = "1"
 FASTX = "2"
 FixedPointNumbers = "0.8"
 GZip = "0.6"
-GaussianMixtureAlignment = "0.2.2, 0.3"
+GaussianMixtureAlignment = "0.2.2, 0.3, 0.4"
 HTTP = "1"
 HiGHS = "1"
 Hungarian = "0.6, 0.7"
@@ -57,6 +58,7 @@ JSON3 = "1"
 JuMP = "1"
 LinearAlgebra = "1"
 MIToS = "3"
+MUSCLE_jll = "5.2.0"
 MultivariateStats = "0.9, 0.10"
 MutableConvexHulls = "0.2"
 OffsetArrays = "1"

diff --git a/src/GPCRAnalysis.jl b/src/GPCRAnalysis.jl
@@ -31,6 +31,9 @@ using GZip
 using FixedPointNumbers
 using ColorTypes
 
+# MUSCLE
+using MUSCLE_jll
+
 const ChainLike = Union{Chain, AbstractVector{<:AbstractResidue}}   # an iterable of residues
 const ResidueLike = Union{Residue, AbstractVector{<:AbstractAtom}}  # an iterable of atoms
 const StructureLike = Union{ChainLike, Model, MolecularStructure}
@@ -39,10 +42,10 @@ const StructureLike = Union{ChainLike, Model, MolecularStructure}
 
 export SequenceMapping, AccessionCode, MSACode, NWGapCosts
 export sequenceindexes, columnindexes, isgap, isunknown, sequencekeys, msasequence, residuematrix, subseqs, subseqs!
-export species, uniprotX, query_uniprot_accession, query_ebi_proteins, query_ncbi
+export species, uniprotX, query_uniprot_accession, query_ebi_proteins, query_ncbi, query_tm_count
 export try_download_alphafold, query_alphafold_latest, download_alphafolds, alphafoldfile, alphafoldfiles, getchain,
        writechain, findall_subseq, pLDDT, pLDDTcolor
-export align_to_axes, align_to_membrane, align_nw, align_ranges, map_closest, align_closest
+export align_to_axes, align_to_membrane, align_nw, align_ranges, map_closest, align_closest, align_muscle, align_family
 export filter_species!, filter_long!, sortperm_msa, chimerax_script
 export project_sequences, columnwise_entropy, align, residue_centroid, residue_centroid_matrix, alphacarbon_coordinates,
        alphacarbon_coordinates_matrix, chargelocations, positive_locations, negative_locations

diff --git a/src/align.jl b/src/align.jl
@@ -82,9 +82,8 @@ end
 
 Compute the rigid transformation `tform` needed to align `chain` to the
 membrane, given the transmembrane segments `tms` as residue-indexes (e.g.,
-`[37:61, 74:96, ...]`). `extracelluar` should be true if the N-terminus of the
-protein is extracellular (`chain[first(tms[1])]` is at the extracellular face),
-and false otherwise.
+`[37:61, 74:96, ...]`). `extracellular` should be true if the N-terminus of the
+chain is extracellular, and false otherwise.
 
 `applytransform!(chain, tform)` (or the `model` that includes `chain`) will
 re-orient `chain` so that the center of the membrane is `z=0` and extracellular
@@ -169,6 +168,169 @@ function collect_leaflet_residues(chain, tms, extracellular::Bool)
     return leaflet_e, leaflet_i
 end
 
+## Alignment using MUSCLE
+# https://github.com/rcedgar/muscle
+
+"""
+    align_muscle(infile::AbstractString, outfile::AbstractString)
+
+Run [MUSCLE](https://github.com/rcedgar/muscle) to align the sequences in
+`infile` and write the aligned sequences to `outfile`. Both files should be in
+FASTA format.
+
+# Examples
+
+Let's get the sequences of two receptors and then align them with MUSCLE:
+
+```
+julia> open("infile.fasta", "w") do io
+    write(io, query_ebi_proteins(["Q8R256", "E9Q8I6"]; format=:fasta))
+end
+
+julia> align_muscle("infile.fasta", "outfile.fasta")
+```
+"""
+align_muscle(infile::AbstractString, outfile::AbstractString) = run(`$(muscle()) -align $infile -output $outfile`)
+
+"""
+    msa = align_family(; ids=nothing, msafile, srcdir, destdir, id_for_tms,
+                         conserved_params=AlignFamilyParams())
+
+Download and structurally align AlphaFold models for a family of receptors.
+
+# Arguments
+- `msafile`: path to the FASTA MSA file. This anchors the workflow: if the file does not
+  exist yet, `ids` must be supplied to create it; if it already exists (e.g. when
+  resuming an interrupted run), it is read directly and `ids` is not needed.
+- `ids`: a vector of UniProt accession codes, required only to create `msafile`.
+  Sequences are fetched from EBI in batches of 50 and aligned with MUSCLE.
+- `srcdir`: directory for downloaded AlphaFold structure files.
+- `destdir`: directory for the aligned output CIF files.
+- `id_for_tms`: UniProt accession of the reference receptor. Must appear in the MSA and
+  have an AlphaFold structure. Its TRANSMEM annotations from EBI are used to orient all
+  structures to the membrane normal. Use [`query_tm_count`](@ref) to find an accession
+  with a complete TM annotation.
+- `conserved_params`: an [`AlignFamilyParams`](@ref) controlling which MSA columns serve
+  as alignment anchors.
+
+# Returns
+The MSA as a `Vector{FASTX.FASTA.Record}`.
+
+# Steps
+1. Create the MSA from `ids` if `msafile` does not already exist.
+2. Fetch TRANSMEM annotations for `id_for_tms` from EBI.
+3. Download AlphaFold structures for all MSA sequences into `srcdir`.
+4. Identify conserved, low-gap MSA columns as structural alignment anchors.
+5. Align the reference structure to the membrane normal.
+6. Align every structure to the reference using α-carbon positions at the conserved
+   columns, and write the results to `destdir`.
+"""
+function align_family(;
+        ids=nothing,
+        msafile,
+        srcdir,
+        destdir,
+        id_for_tms,
+        conserved_params=AlignFamilyParams(),
+    )
+    # Step 1: create MSA from ids if msafile doesn't exist yet
+    if !isfile(msafile)
+        ids !== nothing || throw(ArgumentError(
+            "`msafile` \"$msafile\" does not exist; supply `ids` to create it"
+        ))
+        mktemp() do seqfile, seqio
+            for i in firstindex(ids):50:lastindex(ids)
+                result = query_ebi_proteins(ids[i:min(i+49, lastindex(ids))]; format=:fasta)
+                result === nothing && continue
+                write(seqio, result)
+            end
+            close(seqio)
+            align_muscle(seqfile, msafile)
+        end
+    end
+
+    msa = FASTAReader(open(msafile)) do io
+        collect(io)
+    end
+
+    # Step 2: fetch TM segment annotations for the reference
+    ref_entry = query_ebi_proteins(id_for_tms)
+    (ref_entry === nothing || isempty(ref_entry)) && error("EBI query failed for $id_for_tms")
+    reftms = [parse(Int, f["begin"]):parse(Int, f["end"])
+              for f in only(ref_entry).features
+              if f["type"] == "TRANSMEM"]
+    isempty(reftms) && error("No TRANSMEM features found for $id_for_tms; try a different `id_for_tms`")
+    length(reftms) == 7 || error("Need 7 TM segments for $id_for_tms, but found $(length(reftms)); try a different `id_for_tms`")
+
+    # Step 3: download AlphaFold structures
+    isdir(srcdir) || mkdir(srcdir)
+    download_alphafolds(msa; dirname=srcdir)
+
+    # Step 4: identify conserved, low-gap MSA columns as alignment anchors
+    # columnindexes filters out all-gap columns; map conserved back to full-sequence positions
+    colidxs = columnindexes(msa)
+    resmtrx = residuematrix(msa)   # columns correspond to colidxs
+    colentropy = columnwise_entropy(msa)
+    gapfrac = vec(mean(resmtrx .== '-', dims=1))
+    ce = [gapfrac[j] > conserved_params.gapfrac_max ? Inf : colentropy[j]
+          for j in eachindex(colentropy)]
+    conserved = colidxs[findall(ce .< conserved_params.maxentropy)]
+    conserved_set = Set(conserved)
+
+    # Find the reference sequence in the MSA
+    msaidx = Dict(uniprotX(identifier(msa[i])) => i for i in eachindex(msa))
+    refseqidx = get(msaidx, id_for_tms, nothing)
+    refseqidx === nothing && error("$id_for_tms not found in MSA ($msafile)")
+
+    # Step 5: align reference structure to membrane
+    reffile = alphafoldfile(id_for_tms, srcdir; join=true)
+    reffile === nothing && error("AlphaFold structure for $id_for_tms not found in $srcdir")
+    refchain = getchain(reffile)
+    applytransform!(refchain, align_to_membrane(refchain, reftms))
+
+    # Extract α-carbon coordinates at the conserved MSA columns.
+    # `msaseq` is the full aligned sequence (gaps as '-'); `chainidx` counts non-gap
+    # residues so that residues[chainidx] matches the current MSA position.
+    sentinel = fill(NaN, 3)
+    function alphacoords_at(chain, msaseq)
+        residues = collectresidues(chain)
+        coords = zeros(3, length(conserved))
+        chainidx = outidx = 0
+        for (i, r) in enumerate(msaseq)
+            r != '-' && (chainidx += 1)
+            if i in conserved_set
+                coords[:, outidx += 1] = r == '-' ? sentinel : alphacarbon_coordinates(residues[chainidx])
+            end
+        end
+        return coords
+    end
+
+    refcoords = alphacoords_at(refchain, msasequence(msa, refseqidx))
+    any(isnan, refcoords) && error(
+        "$id_for_tms has gaps at conserved MSA columns; " *
+        "choose a different `id_for_tms` or adjust `conserved_params`"
+    )
+
+    # Step 6: align all structures to the reference and write to destdir
+    isdir(destdir) || mkdir(destdir)
+    for i in eachindex(msa)
+        name = uniprotX(identifier(msa[i]))
+        structfile = alphafoldfile(name, srcdir; join=true)
+        structfile === nothing && continue
+        c = getchain(structfile)
+        ccoords = alphacoords_at(c, msasequence(msa, i))
+        keep = vec(.!any(isnan.(ccoords); dims=1))
+        if !any(keep)
+            @warn "No conserved-column coordinates available for $name; skipping"
+            continue
+        end
+        applytransform!(c, Transformation(ccoords[:, keep], refcoords[:, keep]))
+        writechain(joinpath(destdir, "$name.cif"), c)
+    end
+
+    return msa
+end
+
 ## Needleman-Wunsch alignment
 
 # This implements sequence alignment based on three-dimensional structure, where

diff --git a/src/msa.jl b/src/msa.jl
@@ -271,3 +271,19 @@ function Base.setindex!(fullseqvec::Vector, colvalues::AbstractVector, sm::Seque
     end
     return fullseqvec
 end
+
+"""
+    AlignFamilyParams(; gapfrac_max=0.2, maxentropy=0.2)
+
+Parameters controlling which MSA columns are used as structural alignment anchors in
+[`align_family`](@ref).
+
+- `gapfrac_max`: columns where more than this fraction of sequences have a gap are
+  excluded (default 0.2).
+- `maxentropy`: columns with entropy above this threshold are excluded (default 0.2).
+  Lower entropy means higher conservation.
+"""
+@kwdef struct AlignFamilyParams
+    gapfrac_max::Float64 = 0.2
+    maxentropy::Float64 = 0.2
+end
diff --git a/src/query.jl b/src/query.jl
@@ -16,7 +16,7 @@ You can also supply several proteins as a comma-separated list.
 `result` is a JSON3 object with many fields.
 """
 function query_ebi_proteins(id::AbstractString; size=count(==(','), id)+1, format::Symbol=:json)
-    resp = HTTP.get("https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=$size&accession=$id", ["Accept" => formats[format]])
+    resp = HTTP.get("https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=$size&accession=$id", ["Accept" => formats[format]]; status_exception=false)
     if resp.status == 200
         return mktemp() do tmppath, tmpio
             body = String(resp.body)
@@ -33,6 +33,37 @@ end
 query_ebi_proteins(id::AbstractVector{<:AbstractString}; kwargs...) =
     query_ebi_proteins(join(id, ','); kwargs...)
 
+"""
+    ntms = query_tm_count(id)
+    ntms = query_tm_count(ids)
+
+Query the EBI Proteins API for the number of annotated transmembrane (TRANSMEM) segments.
+
+The single-accession form returns an `Int` (or `nothing` if the entry is not found). The
+vector form returns a `Dict{String,Int}` mapping each returned accession code to its count.
+Queries are batched at 50 IDs per request.
+
+Useful for choosing an `id_for_tms` for [`align_family`](@ref): a good reference receptor
+should have its TM segments fully annotated (e.g., 7 for a canonical GPCR).
+"""
+function query_tm_count(ids::AbstractVector{<:AbstractString}; batchsize=50)
+    result = Dict{String,Int}()
+    for i in firstindex(ids):batchsize:lastindex(ids)
+        batch = ids[i:min(i+batchsize-1, lastindex(ids))]
+        response = query_ebi_proteins(batch)
+        response === nothing && continue
+        for r in response
+            result[String(r["accession"])] = count(f -> f["type"] == "TRANSMEM", r["features"])
+        end
+    end
+    return result
+end
+
+function query_tm_count(id::AbstractString)
+    result = query_tm_count([id])
+    return get(result, id, nothing)
+end
+
 """
     result = query_ncbi(id)
 

diff --git a/src/utils.jl b/src/utils.jl
@@ -40,6 +40,23 @@ function validate_seq_residues(msaseq, chain)
     return true
 end
 
+function validate_seq_residues(msaseq::AbstractString, chain)
+    chainiter = iterate(chain)
+    item, state = chainiter
+    for r in msaseq
+        (isgap(r) || isunknown(r)) && continue
+        res = three2char(String(resname(item)))
+        res == Char(r) || return false
+        chainiter = iterate(chain, state)
+        if chainiter !== nothing
+            item, state = chainiter
+        else
+            state = nothing # throws if this gets used again
+        end
+    end
+    return chainiter === nothing
+end
+
 function countitems(list)
     count = Dict{eltype(list),Int}()
     for item in list

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -423,6 +423,19 @@ using Test
             @test q["total_count"] == 1
         end
 
+        @testset "TM counts" begin
+            # P29274 (ADORA2A_HUMAN) and P29275 (ADORA2B_HUMAN) are canonical GPCRs
+            # with 7 annotated TM segments in UniProt
+            ntms = query_tm_count(["P29274", "P29275"])
+            @test get(ntms, "P29274", 0) == 7
+            @test get(ntms, "P29275", 0) == 7
+
+            # Single-ID form returns an Int
+            @test query_tm_count("P29274") == 7
+            @test query_tm_count("P02769") == 0          # BSA — cytosolic, no TMs
+            @test query_tm_count("notanid12345") === nothing  # nonexistent ID
+        end
+
         @testset "AlphaFold" begin
             @test try_download_alphafold("garbage") === nothing
             uname = "K7N608"
@@ -537,5 +550,25 @@ using Test
                 @test occursin("matchmaker #2-2 to #1", script)
             end
         end
+        @testset "align_family" begin
+            # Use two related human adenosine receptors (same subfamily, well annotated)
+            ids = ["P29274", "P29275"]
+            mktempdir() do dir
+                msafile = joinpath(dir, "msa.fasta")
+                srcdir  = joinpath(dir, "src")
+                destdir = joinpath(dir, "dest")
+                msa = align_family(; ids, msafile, srcdir, destdir, id_for_tms="P29274")
+                @test length(msa) == 2
+                @test isfile(msafile)            # MSA was written
+                @test isfile(joinpath(destdir, "P29274.cif"))
+                @test isfile(joinpath(destdir, "P29275.cif"))
+                # Output files should be valid CIF structures
+                @test getchain(joinpath(destdir, "P29274.cif")) isa GPCRAnalysis.ChainLike
+                @test getchain(joinpath(destdir, "P29275.cif")) isa GPCRAnalysis.ChainLike
+                # Re-running without ids (msafile exists) should complete without errors
+                msa2 = align_family(; msafile, srcdir, destdir, id_for_tms="P29274")
+                @test length(msa2) == 2
+            end
+        end
     end
 end