Skip to content

Commit 1e44b64

Browse files
committed
Refactor/bug fixes for merging -- DO NOT USE IN PRODUCTION YET
1 parent 8d56e65 commit 1e44b64

File tree

8 files changed

+1667
-299
lines changed

8 files changed

+1667
-299
lines changed

app/models/api/haplogroups/TreeMergeModels.scala

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,47 @@ object SplitOperation {
212212
implicit val format: OFormat[SplitOperation] = Json.format[SplitOperation]
213213
}
214214

215+
/**
 * Records an ambiguous or inconsistent placement encountered during a merge.
 *
 * Phylogenetic placement is often uncertain. An instance flags, for human
 * curators, a branch where the SNP data is contradictory, possibly due to:
 *   - Sequencing errors in source data
 *   - Recurrent mutations (homoplasy)
 *   - Missing intermediate nodes
 *   - Nomenclature mismatches between sources
 *
 * The class is `final`: it is a plain data carrier and is not meant to be subclassed.
 *
 * @param nodeName            The node with ambiguous placement
 * @param ambiguityType       Classification of the ambiguity (a free-form string; the companion
 *                            object defines the ambiguity type constants)
 * @param description         Human-readable explanation
 * @param sharedVariants      Variants that matched (overlap); defaults to an empty list
 * @param conflictingVariants Variants that conflict (present in one but not expected);
 *                            defaults to an empty list
 * @param candidateMatches    Other nodes that could have been matches; defaults to an empty list
 * @param resolution          How the algorithm resolved the ambiguity
 * @param confidence          Score from 0.0 (very uncertain) to 1.0 (confident); the range is
 *                            not enforced by this class
 */
final case class PlacementAmbiguity(
  nodeName: String,
  ambiguityType: String,
  description: String,
  sharedVariants: List[String] = List.empty,
  conflictingVariants: List[String] = List.empty,
  candidateMatches: List[String] = List.empty,
  resolution: String,
  confidence: Double
)
244+
245+
object PlacementAmbiguity {
  // Ambiguity type constants

  /** Some SNPs match, others don't. */
  val PARTIAL_MATCH: String = "PARTIAL_MATCH"

  /** Multiple nodes could be the match. */
  val MULTIPLE_CANDIDATES: String = "MULTIPLE_CANDIDATES"

  /** SNP appears in multiple lineages. */
  val RECURRENT_SNP: String = "RECURRENT_SNP"

  /** Node placed without strong variant evidence. */
  val ORPHAN_PLACEMENT: String = "ORPHAN_PLACEMENT"

  /** Name matches but variants differ. */
  val NAME_VARIANT_MISMATCH: String = "NAME_VARIANT_MISMATCH"

  /** JSON format for [[PlacementAmbiguity]], derived by `Json.format`. */
  implicit val format: OFormat[PlacementAmbiguity] = Json.format[PlacementAmbiguity]
}
255+
215256
/**
216257
* Result of a merge operation.
217258
*/
@@ -221,6 +262,7 @@ case class TreeMergeResponse(
221262
statistics: MergeStatistics,
222263
conflicts: List[MergeConflict] = List.empty,
223264
splits: List[SplitOperation] = List.empty,
265+
ambiguities: List[PlacementAmbiguity] = List.empty,
224266
errors: List[String] = List.empty
225267
)
226268

@@ -243,6 +285,7 @@ case class MergePreviewResponse(
243285
statistics: MergeStatistics,
244286
conflicts: List[MergeConflict],
245287
splits: List[SplitOperation],
288+
ambiguities: List[PlacementAmbiguity],
246289
newNodes: List[String],
247290
updatedNodes: List[String],
248291
unchangedNodes: List[String]

app/repositories/HaplogroupCoreRepository.scala

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,38 @@ trait HaplogroupCoreRepository {
152152
* @return A Future containing an Option with the latest revision ID, or None if no relationships exist for the child.
153153
*/
154154
def getLatestRelationshipRevisionId(childHaplogroupId: Int): Future[Option[Int]]
155+
156+
// === Bulk Operations for Merge ===
157+
158+
/**
159+
* Create multiple haplogroups in a single batch operation.
160+
*
161+
* @param haplogroups The haplogroups to create
162+
* @return A Future containing a map from haplogroup name to newly assigned ID; because the map is keyed by name, inputs that share a name yield a single entry
163+
*/
164+
def createBatch(haplogroups: Seq[Haplogroup]): Future[Map[String, Int]]
165+
166+
/**
167+
* Create multiple parent-child relationships in a single batch operation. The implementation in this file gives every created row revisionId 1, a validFrom of the current local time and no validUntil.
168+
*
169+
* @param relationships Sequence of (childId, parentId, source) tuples
170+
* @return A Future containing the sequence of created relationship IDs (empty when no relationships are given)
171+
*/
172+
def createRelationshipsBatch(relationships: Seq[(Int, Int, String)]): Future[Seq[Int]]
173+
174+
/**
175+
* Update provenance for multiple haplogroups in a single batch. The implementation in this file issues one update per tuple and runs them together transactionally.
176+
*
177+
* @param updates Sequence of (haplogroupId, provenance) tuples
178+
* @return A Future containing the count of updated records, summed over all individual updates (0 when no updates are given)
179+
*/
180+
def updateProvenanceBatch(updates: Seq[(Int, HaplogroupProvenance)]): Future[Int]
181+
182+
/**
183+
* Get all active parent-child relationships whose child haplogroup is of the given haplogroup type.
184+
* Returns (childId, parentId) tuples for building in-memory tree.
185+
*/
186+
def getAllRelationships(haplogroupType: HaplogroupType): Future[Seq[(Int, Int)]]
155187
}
156188

157189
class HaplogroupCoreRepositoryImpl @Inject()(
@@ -550,4 +582,60 @@ class HaplogroupCoreRepositoryImpl @Inject()(
550582
.result
551583
runQuery(query)
552584
}
585+
586+
// === Bulk Operations for Merge ===
587+
588+
/**
 * Inserts the given haplogroups with one batch insert action and reports the generated IDs.
 *
 * @param haplogroupsToCreate rows to insert; an empty sequence completes immediately
 *                            without running a query
 * @return a Future of a map from each inserted haplogroup's name to its generated ID
 */
override def createBatch(haplogroupsToCreate: Seq[Haplogroup]): Future[Map[String, Int]] =
  if (haplogroupsToCreate.isEmpty) Future.successful(Map.empty)
  else {
    // Ask the database to hand back (name, id) for every inserted row.
    val insertReturningNameAndId =
      haplogroups.returning(haplogroups.map(row => (row.name, row.haplogroupId)))

    runQuery(insertReturningNameAndId ++= haplogroupsToCreate).map(_.toMap)
  }
595+
596+
/**
 * Inserts one relationship row per (childId, parentId, source) tuple in a single batch
 * insert action.
 *
 * Every row is created with revisionId 1, validFrom set to a single timestamp taken once
 * for the whole batch, and no validUntil.
 *
 * @param relationships (childId, parentId, source) tuples; an empty sequence completes
 *                      immediately without running a query
 * @return a Future of the generated relationship IDs
 */
override def createRelationshipsBatch(relationships: Seq[(Int, Int, String)]): Future[Seq[Int]] = {
  import models.domain.haplogroups.HaplogroupRelationship

  if (relationships.isEmpty) Future.successful(Seq.empty)
  else {
    // One timestamp shared by all rows of this batch.
    val insertedAt = LocalDateTime.now()

    val rows = relationships.map { relationship =>
      val (childId, parentId, source) = relationship
      HaplogroupRelationship(
        id = None,
        childHaplogroupId = childId,
        parentHaplogroupId = parentId,
        revisionId = 1,
        validFrom = insertedAt,
        validUntil = None,
        source = source
      )
    }

    val insertReturningId =
      haplogroupRelationships.returning(haplogroupRelationships.map(_.haplogroupRelationshipId))

    runQuery(insertReturningId ++= rows)
  }
}
618+
619+
/**
 * Sets the provenance of each listed haplogroup, running all updates transactionally.
 *
 * @param updates (haplogroupId, provenance) tuples; an empty sequence completes
 *                immediately with 0 and runs no query
 * @return a Future of the total number of rows updated, summed over the individual updates
 */
override def updateProvenanceBatch(updates: Seq[(Int, HaplogroupProvenance)]): Future[Int] =
  if (updates.isEmpty) Future.successful(0)
  else {
    // Single-row update action for one haplogroup.
    def setProvenance(targetId: Int, provenance: HaplogroupProvenance) =
      haplogroups
        .filter(_.haplogroupId === targetId)
        .map(_.provenance)
        .update(Some(provenance))

    val perRowUpdates = updates.map { case (targetId, provenance) =>
      setProvenance(targetId, provenance)
    }

    runTransactionally(DBIO.sequence(perRowUpdates)).map(rowCounts => rowCounts.sum)
  }
632+
633+
/**
 * Loads (childId, parentId) for every active relationship whose child is an active
 * haplogroup of the given type.
 *
 * @param haplogroupType the type the child haplogroup must have
 * @return a Future of (childHaplogroupId, parentHaplogroupId) tuples
 */
override def getAllRelationships(haplogroupType: HaplogroupType): Future[Seq[(Int, Int)]] = {
  val childParentPairs = activeRelationships.flatMap { rel =>
    activeHaplogroups
      .withFilter { child =>
        child.haplogroupId === rel.childHaplogroupId && child.haplogroupType === haplogroupType
      }
      .map(_ => (rel.childHaplogroupId, rel.parentHaplogroupId))
  }

  runQuery(childParentPairs.result)
}
553641
}

app/repositories/HaplogroupVariantRepository.scala

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,32 @@ trait HaplogroupVariantRepository {
9696
* @return A Future containing a sequence of (haplogroupId, VariantV2) tuples.
9797
*/
9898
def getVariantsForHaplogroups(haplogroupIds: Seq[Int]): Future[Seq[(Int, VariantV2)]]
99+
100+
/**
101+
* Bulk associate variants with haplogroups in a single operation.
102+
* Uses ON CONFLICT so that pairs already present in the table are kept and their existing IDs are returned.
103+
*
104+
* @param associations Sequence of (haplogroupId, variantId) tuples
105+
* @return A Future containing the sequence of haplogroup_variant_ids created or found (empty when no associations are given)
106+
*/
107+
def bulkAddVariantsToHaplogroups(associations: Seq[(Int, Int)]): Future[Seq[Int]]
108+
109+
/**
110+
* Bulk remove variant associations from a haplogroup. Only the association rows are deleted.
111+
*
112+
* @param haplogroupId The haplogroup to remove variants from
113+
* @param variantIds The variant IDs to remove (not haplogroup_variant_ids)
114+
* @return A Future containing the number of associations removed (0 when no variant IDs are given)
115+
*/
116+
def bulkRemoveVariantsFromHaplogroup(haplogroupId: Int, variantIds: Seq[Int]): Future[Int]
117+
118+
/**
119+
* Gets the variant IDs (not haplogroup_variant_ids) associated with a haplogroup.
120+
*
121+
* @param haplogroupId The haplogroup ID
122+
* @return A Future containing the sequence of variant IDs (empty when the haplogroup has no associations)
123+
*/
124+
def getVariantIdsForHaplogroup(haplogroupId: Int): Future[Seq[Int]]
99125
}
100126

101127
class HaplogroupVariantRepositoryImpl @Inject()(
@@ -298,4 +324,38 @@ class HaplogroupVariantRepositoryImpl @Inject()(
298324

299325
runQuery(query.result)
300326
}
327+
328+
/**
 * Associates variants with haplogroups using one multi-row INSERT.
 *
 * Pairs that already exist in `tree.haplogroup_variant` are kept: the ON CONFLICT branch
 * performs a no-op self-assignment so that RETURNING also yields the ID of the existing row.
 *
 * Pairs repeated within `associations` are collapsed before the statement is built.
 * PostgreSQL refuses an `INSERT ... ON CONFLICT DO UPDATE` that would touch the same row
 * twice in one command, so a repeated pair would otherwise fail the whole batch.
 *
 * @param associations (haplogroupId, variantId) tuples; an empty sequence completes
 *                     immediately without running a query
 * @return a Future of the haplogroup_variant_ids created or found, one per distinct pair
 */
override def bulkAddVariantsToHaplogroups(associations: Seq[(Int, Int)]): Future[Seq[Int]] = {
  val uniquePairs = associations.distinct

  if (uniquePairs.isEmpty) Future.successful(Seq.empty)
  else {
    // Both tuple members are Ints, so splicing them into the statement text with #$ cannot
    // inject SQL.
    val valuesClause = uniquePairs
      .map { case (hgId, varId) => s"($hgId, $varId)" }
      .mkString(", ")

    val insertQuery = sql"""
      INSERT INTO tree.haplogroup_variant (haplogroup_id, variant_id)
      VALUES #$valuesClause
      ON CONFLICT (haplogroup_id, variant_id) DO UPDATE
      SET haplogroup_id = EXCLUDED.haplogroup_id
      RETURNING haplogroup_variant_id
    """.as[Int]

    runQuery(insertQuery)
  }
}
346+
347+
/**
 * Deletes the association rows linking one haplogroup to any of the given variants.
 *
 * @param haplogroupId the haplogroup whose associations are removed
 * @param variantIds   variant IDs to detach; an empty sequence completes immediately
 *                     with 0 and runs no query
 * @return a Future of the number of association rows deleted
 */
override def bulkRemoveVariantsFromHaplogroup(haplogroupId: Int, variantIds: Seq[Int]): Future[Int] =
  if (variantIds.isEmpty) Future.successful(0)
  else {
    val matchingAssociations = haplogroupVariants.filter { row =>
      row.haplogroupId === haplogroupId && row.variantId.inSet(variantIds)
    }

    runQuery(matchingAssociations.delete)
  }
356+
357+
/**
 * Looks up the variant IDs (not haplogroup_variant_ids) associated with a haplogroup.
 *
 * @param haplogroupId the haplogroup whose associations are read
 * @return a Future of the associated variant IDs
 */
override def getVariantIdsForHaplogroup(haplogroupId: Int): Future[Seq[Int]] = {
  val variantIdsOfHaplogroup = for {
    association <- haplogroupVariants if association.haplogroupId === haplogroupId
  } yield association.variantId

  runQuery(variantIdsOfHaplogroup.result)
}
301361
}

app/repositories/VariantV2Repository.scala

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,13 @@ trait VariantV2Repository {
9898
def streamAll(): Future[Seq[VariantV2]]
9999
def findByIds(ids: Seq[Int]): Future[Seq[VariantV2]]
100100

101+
/**
102+
* Bulk search for variants by a list of names. NOTE(review): the implementation in this file matches canonical names only (each name as given and uppercased); aliases are not consulted.
103+
* Returns a map from the uppercased canonical name of each match to the matching variants.
104+
* Much more efficient than individual searchByName calls for large batches.
105+
*/
106+
def searchByNames(names: Seq[String]): Future[Map[String, Seq[VariantV2]]]
107+
101108
// === DU Naming Authority ===
102109

103110
def nextDuName(): Future[String]
@@ -625,6 +632,44 @@ class VariantV2RepositoryImpl @Inject()(
625632
override def streamAll(): Future[Seq[VariantV2]] = db.run(variantsV2.result)
626633
override def findByIds(ids: Seq[Int]): Future[Seq[VariantV2]] = if (ids.isEmpty) Future.successful(Seq.empty) else db.run(variantsV2.filter(_.variantId.inSet(ids)).result)
627634

635+
/**
 * Looks up variants whose `canonical_name` exactly equals one of the given names, trying
 * each name both as given and uppercased.
 *
 * The candidate names are de-duplicated, split into batches of 2000 and queried one batch
 * after another; a batch is only started once the previous one has completed.
 *
 * NOTE(review): only `canonical_name` is matched here; the `aliases` column is selected
 * but never used for matching.
 *
 * @param names names to search for; an empty sequence completes immediately with an
 *              empty map and runs no query
 * @return a Future of a map from uppercased canonical name to the variants carrying it
 */
override def searchByNames(names: Seq[String]): Future[Map[String, Seq[VariantV2]]] =
  if (names.isEmpty) Future.successful(Map.empty)
  else {
    // PostgreSQL handles IN/ANY efficiently up to several thousand
    val batchSize = 2000

    // Variant names are typically uppercase (M269, L21, etc.)
    val candidates = names.flatMap(name => Seq(name, name.toUpperCase)).distinct

    // Runs one query for a batch and groups the hits by uppercased canonical name.
    def lookup(batch: Seq[String]): Future[Map[String, Seq[VariantV2]]] = {
      // Names are spliced in as quoted SQL literals; embedded single quotes are doubled.
      val namesArray = batch.map(n => s"'${n.replace("'", "''")}'").mkString(",")
      val query = sql"""
        SELECT variant_id, canonical_name, mutation_type, naming_status,
        aliases, coordinates, defining_haplogroup_id, evidence,
        primers, notes, created_at, updated_at, annotations
        FROM variant_v2
        WHERE canonical_name = ANY(ARRAY[#$namesArray])
      """.as[VariantV2](variantV2GetResult)

      db.run(query).map { rows =>
        rows
          .flatMap(variant => variant.canonicalName.map(cn => cn.toUpperCase -> variant))
          .groupMap(_._1)(_._2)
      }
    }

    val nothingFound = Future.successful(Map.empty[String, Seq[VariantV2]])

    // Process batches sequentially, folding each batch's hits into the running result.
    candidates.grouped(batchSize).toSeq.foldLeft(nothingFound) { (pending, batch) =>
      for {
        merged <- pending
        found <- lookup(batch)
      } yield found.foldLeft(merged) { case (soFar, (key, hits)) =>
        soFar.updated(key, soFar.get(key).fold(hits)(_ ++ hits))
      }
    }
  }
672+
628673
// === DU Naming Authority ===
629674

630675
private val DuNamePattern = "^DU[1-9][0-9]*$".r

0 commit comments

Comments
 (0)