diff --git a/src/main/kotlin/biokotlin/gff/Chromosome.kt b/src/main/kotlin/biokotlin/gff/Chromosome.kt new file mode 100644 index 00000000..18af4181 --- /dev/null +++ b/src/main/kotlin/biokotlin/gff/Chromosome.kt @@ -0,0 +1,14 @@ +package biokotlin.gff + +class Chromosome ( + seqid: String, + source: String, + start: Int, + end: Int, + attributes: Map = emptyMap(), + children: List = emptyList() +) : Feature(seqid, source, start, end, attributes = attributes, children = children) { + + override fun type(): FeatureType = FeatureType.Chromosome + +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/Coding.kt b/src/main/kotlin/biokotlin/gff/Coding.kt new file mode 100644 index 00000000..21d692c0 --- /dev/null +++ b/src/main/kotlin/biokotlin/gff/Coding.kt @@ -0,0 +1,20 @@ +package biokotlin.gff + +/** + * Also known as CDS + */ +class Coding( + seqid: String, + source: String, + start: Int, + end: Int, + score: Double = Double.NaN, + strand: String = "+", + phase: String = ".", + attributes: Map = emptyMap(), + children: List = emptyList() +) : Feature(seqid, source, start, end, score, strand, phase, attributes, children) { + + override fun type(): FeatureType = FeatureType.Coding + +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/Exon.kt b/src/main/kotlin/biokotlin/gff/Exon.kt new file mode 100644 index 00000000..65e9d1f4 --- /dev/null +++ b/src/main/kotlin/biokotlin/gff/Exon.kt @@ -0,0 +1,29 @@ +package biokotlin.gff + +class Exon( + seqid: String, + source: String, + start: Int, + end: Int, + score: Double = Double.NaN, + strand: String = "+", + phase: String = ".", + attributes: Map = emptyMap(), + children: List = emptyList() +) : Feature(seqid, source, start, end, score, strand, phase, attributes, children) { + + override fun type(): FeatureType = FeatureType.Exon + + fun coding(): List { + return children.filterIsInstance().toList() + } + + fun leaders(): List { + return children.filterIsInstance().toList() + } + + fun terminators(): List { + return children.filterIsInstance().toList() + } + +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/Feature.kt b/src/main/kotlin/biokotlin/gff/Feature.kt new file mode 100644 index 00000000..8f3e6657 --- /dev/null +++ b/src/main/kotlin/biokotlin/gff/Feature.kt @@ -0,0 +1,158 @@ +package biokotlin.gff + +enum class FeatureType { + Gene, Exon, Leader, Terminator, Coding, mRNA, Intron, Chromosome, Scaffold; + + companion object { + /** + * Converts from standard names in GFF files to a FeatureType. Case-insensitive + */ + fun fromGffString(gffString: String): FeatureType { + return when (gffString.lowercase()) { + "gene" -> Gene + "exon" -> Exon + "five_prime_utr" -> Leader + "three_prime_utr" -> Terminator + "cds" -> Coding + "mrna" -> mRNA + "intron" -> Intron + "chromosome" -> Chromosome + "scaffold" -> Scaffold + else -> throw Exception("Feature $gffString is not supported") + } + } + } +} + +/** + * @param seqid The ID of the landmark used to establish the coordinate system for the current feature. IDs may contain + * any characters, but must escape any characters not in the set [a-zA-Z0-9.:^*$@!+_?-|]. In particular, IDs may not + * contain unescaped whitespace and must not begin with an unescaped ">". + * @param source The source is a free text qualifier intended to describe the algorithm or operating procedure that + * generated this feature. Typically this is the name of a piece of software, such as "Genescan" or a database name, + * such as "Genbank." In effect, the source is used to extend the feature ontology by adding a qualifier to the type + * creating a new composite type that is a subclass of the type in the type column. + * @param start The start and end coordinates of the feature are given in positive 1-based integer coordinates, relative + * to the landmark given in column one. Start is always less than or equal to end. For features that cross the origin + * of a circular feature (e.g. most bacterial genomes, plasmids, and some viral genomes), the requirement for start to + * be less than or equal to end is satisfied by making end = the position of the end + the length of the landmark feature. + * @param end The start and end coordinates of the feature are given in positive 1-based integer coordinates, relative + * to the landmark given in column one. Start is always less than or equal to end. For features that cross the origin + * of a circular feature (e.g. most bacterial genomes, plasmids, and some viral genomes), the requirement for start to + * be less than or equal to end is satisfied by making end = the position of the end + the length of the landmark feature. + * @param score The score of the feature, a floating point number. As in earlier versions of the format, the semantics + * of the score are ill-defined. It is strongly recommended that E-values be used for sequence similarity features, + * and that P-values be used for ab initio gene prediction features. + * @param strand The strand of the feature. + for positive strand (relative to the landmark), - for minus strand, + * and . for features that are not stranded. In addition, ? can be used for features whose strandedness is relevant, + * but unknown. + * @param phase For features of type "CDS", the phase indicates where the next codon begins relative to the 5' end + * (where the 5' end of the CDS is relative to the strand of the CDS feature) of the current CDS feature. For + * clarification the 5' end for CDS features on the plus strand is the feature's start and and the 5' end for CDS + * features on the minus strand is the feature's end. The phase is one of the integers 0, 1, or 2, indicating the + * number of bases forward from the start of the current CDS feature the next codon begins. A phase of "0" indicates + * that a codon begins on the first nucleotide of the CDS feature (i.e. 0 bases forward), a phase of "1" indicates + * that the codon begins at the second nucleotide of this CDS feature and a phase of "2" indicates that the codon + * begins at the third nucleotide of this region. Note that ‘Phase’ in the context of a GFF3 CDS feature should not + * be confused with the similar concept of frame that is also a common concept in bioinformatics. Frame is generally + * calculated as a value for a given base relative to the start of the complete open reading frame (ORF) or the + * codon (e.g. modulo 3) while CDS phase describes the start of the next codon relative to a given CDS feature. + * @param attributes A list of feature attributes in the format tag=value. Multiple tag=value pairs are separated + * by semicolons. URL escaping rules are used for tags or values containing the following characters: ",=;". Spaces + * are allowed in this field, but tabs must be replaced with the %09 URL escape. Attribute values do not need to be + * and should not be quoted. The quotes should be included as part of the value by parsers and not stripped. + * @param children TODO + * @see FeatureBuilder + */ +abstract class Feature( + val seqid: String, + val source: String, + val start: Int, + val end: Int, + val score: Double = Double.NaN, + val strand: String = "+", + val phase: String = ".", + var attributes: Map = emptyMap(), + var children: List = emptyList() +) { + + init { + attributes = attributes.toMap() + children = children.sortedWith(FeatureComparator()) + } + + abstract fun type(): FeatureType + + fun attribute(key: String) = attributes[key] + + //TODO make this an actual pointer and handle multiple parents + fun parent(): Feature = TODO() + + fun id() = attributes["ID"] + + fun name() = attributes["Name"] + + fun alias() = attributes["Alias"] + + fun target() = attributes["Target"] + + fun gap() = attributes["Gap"] + + fun derivesFrom() = attributes["Derives_from"] + + fun note() = attributes["Note"] + + fun dbxref() = attributes["Dbxref"] + + fun ontologyTerm() = attributes["Ontology_term"] + + fun isCircular() = attributes["Is_circular"] + + /** + * Compares this to [other] alphabetically by seqid, then by start, then by end position. + * Returns zero if this and [other] are equal in ordering, a negative number if this is less + * than [other], or a positive number if this is greater than [other]. + */ + fun compareTo(other: Feature): Int { + return if (seqid.compareTo(other.seqid) == 0) { + if (start.compareTo(other.start) == 0) { + end.compareTo(other.end) + } else { + start.compareTo(other.start) + } + } else { + seqid.compareTo(other.seqid) + } + } + /** + * Returns the feature as a string representing row in a GFF file + */ + override fun toString(): String { + val scoreString = if (score.isNaN()) { + "." + } else { + score.toString() + } + + val attributesString = StringBuilder() + for ((tag, value) in attributes) { + attributesString.append("$tag=$value;") + } + + return "$seqid\t$source\t${type()}\t$start\t$end\t$scoreString\t$strand\t$phase\t${attributesString}\n" + } +} + +/** + * Provides ordering for feature + */ +class FeatureComparator: Comparator { + /** + * Returns the same result as [p0].compareTo([p1]) unless one of the arguments is null, in which case it returns 0. + */ + override fun compare(p0: Feature?, p1: Feature?): Int { + if (p0 == null || p1 == null) return 0 + + return p0.compareTo(p1) + } +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/FeatureBuilder.kt b/src/main/kotlin/biokotlin/gff/FeatureBuilder.kt new file mode 100644 index 00000000..856081da --- /dev/null +++ b/src/main/kotlin/biokotlin/gff/FeatureBuilder.kt @@ -0,0 +1,50 @@ +package biokotlin.gff + +//TODO add pointer to parent(s) +/** + * A mutable representation of a genetic feature that can be built into a [Feature]. + * @see Feature + */ +class FeatureBuilder( + val seqid: String, + val source: String, + val type: FeatureType, + val start: Int, + val end: Int, + val score: Double = 0.0, + val strand: String = "+", + val phase: String = ".", + var attributes: Map = emptyMap() +) { + + val children = mutableListOf() + + fun id() = attributes["ID"] + + fun add(child: FeatureBuilder) { + children.add(child) + } + + /** + * Builds this feature and its children, recursively. + * @return An immutable [Feature] with the properties of this [FeatureBuilder] and whose children are built + * versions of the this [FeatureBuilder]'s children, sorted by [FeatureComparator]. + */ + fun build(): Feature { + + val children = children.map { it.build() } + return when (type) { + FeatureType.Gene -> Gene(seqid, source, start, end, score, strand, phase, attributes, children) + FeatureType.Exon -> Exon(seqid, source, start, end, score, strand, phase, attributes, children) + FeatureType.Leader -> Leader(seqid, source, start, end, score, strand, phase, attributes, children) + FeatureType.Terminator -> Terminator(seqid, source, start, end, score, strand, phase, attributes, children) + FeatureType.Coding -> Coding(seqid, source, start, end, score, strand, phase, attributes, children) + FeatureType.mRNA -> MRNA(seqid, source, start, end, score, strand, phase, attributes, children) + FeatureType.Intron -> Intron(seqid, source, start, end) + FeatureType.Chromosome -> Chromosome(seqid, source, start, end, attributes, children) + FeatureType.Scaffold -> Scaffold(seqid, source, start, end, attributes, children) + } + + } + +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/Gene.kt b/src/main/kotlin/biokotlin/gff/Gene.kt new file mode 100644 index 00000000..3f945339 --- /dev/null +++ b/src/main/kotlin/biokotlin/gff/Gene.kt @@ -0,0 +1,22 @@ +package biokotlin.gff + + +class Gene( + seqid: String, + source: String, + start: Int, + end: Int, + score: Double = Double.NaN, + strand: String = "+", + phase: String = ".", + attributes: Map = emptyMap(), + children: List = emptyList() +) : Feature(seqid, source, start, end, score, strand, phase, attributes, children) { + + override fun type(): FeatureType = FeatureType.Gene + + fun mRNAs(): List { + return children.filterIsInstance().toList() + } + +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/GffTree.kt b/src/main/kotlin/biokotlin/gff/GffTree.kt new file mode 100644 index 00000000..e9492971 --- /dev/null +++ b/src/main/kotlin/biokotlin/gff/GffTree.kt @@ -0,0 +1,165 @@ +package biokotlin.gff + +import biokotlin.util.bufferedReader + +/** + * A tree representation of a GFF file, with parent/child relationships in the tree created from the Parent attribute + * of elements in the GFF file. + * @param gff Path to the GFF file + */ +class GffTree(val gff: String): Iterable { + /** + * The top-level features of the gff, sorted by [biokotlin.gff.FeatureComparator] + */ + val roots = parseRoots(gff) + + /** + * A map of all IDs to the feature with that ID. Features without IDs are excluded. + * TODO This shouldn't be a list + */ + val idMap: Map by lazy { + getMap ( {it.id()} ) + } + + //TODO add more map presets + + /** + * Iterates over the GffTree. Starts at the first root (as defined by the order in [biokotlin.gff.FeatureComparator]), + * performs a depth-first traversal, then moves to the next root. + */ + override fun iterator(): Iterator { + return toList().iterator() + } + + /** + * Converts the tree representation to a list. The first element is the first root (as defined by the order in + * [biokotlin.gff.FeatureComparator]), then a depth-first ordering of its children, then the next root and a + * depth-first ordering of its children, etc. + */ + fun toList(): List { + TODO("Not yet implemented") + } + + /** + * Returns a map where each feature in the [GffTree] is added to a list with the key `mapping(feature)`. + * The features in each list will be ordered such that earlier elements in [iterator] are prior to later elements + * in [iterator]. When [mapping] returns `null`, no key-value pair is added to the map. The insertion order of + * the map follows the order of [iterator]. + * + * This can be useful for accessing features by their non-standard attributes + * ```kotlin + * val gffTree = GffTree("/home/user/b73.gff3") + * //Map the features by the non-standard protein_id attribute + * val proteinIDMap = gffTree.getMap { it.attributes["protein_id"] } + * //Access all features that have "Zm00001eb000020_P004" in their "protein_id" attribute + * val features = proteinIDMap["Zm00001eb000020_P004"] + * ``` + * + * Combined with custom data classes, this method also allows for convenient access of all features + * that have a particular combination of properties. + * + * Let's say you have a GFF where exons do not have unique IDs, but they can be uniquely defined by their parent's + * ID and their rank. + * ```kotlin + * data class parentAndRank(val parent: String, val rank: String) + * val gffTree = GffTree("/home/user/myGff.gff3") + * val parentAndRankMap = gffTree.getMap { feature -> + * val parent = feature.attributes["Parent"] + * val rank = feature.attributes["rank"] + * //Do not create mappings for anything that does not have a parent or a rank + * if (parent == null || rank == null) null + * else parentAndRank(parent, rank) + * } + * //Pull out the third exon on transcript Zm00001eb442870_T001 + * val exon = parentAndRankMap[parentAndRank("Zm00001eb442870_T001", "3")] + * ``` + * + * @see idMap + * @param mapping The function that produces keys for a given feature + */ + fun getMap(mapping: (Feature) -> T?): Map> { + TODO("Not yet implemented") + } + +} + +fun main() { + val gffTree = GffTree("/home/user/myGff.gff3") + val proteinIDs = gffTree.getMap { it.attributes["protein_id"] } + val protein = proteinIDs["Zm00001eb442910_P001"] // List of CDS regions + val cds = protein?.get(0) + val transcript = cds?.parent() + val gene = transcript?.parent() + +/** + * Parses a GFF and returns a list of top-level features sorted by [biokotlin.gff.FeatureComparator] + * @param gff Path to gff file + */ +private fun parseRoots(gff: String): List { + val file = bufferedReader(gff) + val roots = MutableList(0) { FeatureBuilder("", "", FeatureType.Chromosome, 0, 0) } + val orphans = MutableList(0) { FeatureBuilder("", "", FeatureType.Chromosome, 0, 0) } + val registry = HashMap(0) //id to feature builder + + var line = file.readLine() + var lineNumber = 0 + while (line != null) { + if (line.startsWith("#")) { + line = file.readLine() + lineNumber++ + continue + } + + val split = line.split("\t") + val attributes = split[8].split(";") + val attributeMap = HashMap(0) + attributes.forEach { + val splitAttribute = it.split("=") + attributeMap[splitAttribute[0]] = splitAttribute[1] + } + val score = if (split[5] == ".") { + Double.NaN + } else { + split[5].toDouble() + } + val featureBuilder = FeatureBuilder( + split[0], + split[1], + FeatureType.fromGffString(split[2]), + split[3].toInt(), + split[4].toInt(), + score, + split[6], + split[7], + attributeMap + ) + + if (featureBuilder.attributes["ID"] != null) { + registry[featureBuilder.attributes["ID"]!!] = featureBuilder + } + + if (featureBuilder.attributes["Parent"] != null) { + if (registry.contains(featureBuilder.attributes["Parent"])) { + registry[featureBuilder.attributes["Parent"]]?.add(featureBuilder) + } else { + orphans.add(featureBuilder) + } + } else { + roots.add(featureBuilder) + } + + line = file.readLine() + lineNumber++ + } + + for (orphan in orphans) { + if (registry.contains(orphan.attributes["Parent"])) { + registry[orphan.attributes["Parent"]]?.add(orphan) + } else { + roots.add(orphan) + println("Warning: Orphaned element. Parent ${orphan.attributes["Parent"]} is not in the file") + } + } + + return roots.map { it.build() }.sortedWith(FeatureComparator()) +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/Intron.kt b/src/main/kotlin/biokotlin/gff/Intron.kt new file mode 100644 index 00000000..d11a429a --- /dev/null +++ b/src/main/kotlin/biokotlin/gff/Intron.kt @@ -0,0 +1,12 @@ +package biokotlin.gff + +class Intron( + seqid: String, + source: String, + start: Int, + end: Int +) : Feature(seqid, source, start, end) { + + override fun type(): FeatureType = FeatureType.Intron + +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/Leader.kt b/src/main/kotlin/biokotlin/gff/Leader.kt new file mode 100644 index 00000000..c2518a5c --- /dev/null +++ b/src/main/kotlin/biokotlin/gff/Leader.kt @@ -0,0 +1,20 @@ +package biokotlin.gff + +/** + * Also known as 5' UTR + */ +class Leader( + seqid: String, + source: String, + start: Int, + end: Int, + score: Double = Double.NaN, + strand: String = "+", + phase: String = ".", + attributes: Map = emptyMap(), + children: List = emptyList() +) : Feature(seqid, source, start, end, score, strand, phase, attributes, children) { + + override fun type(): FeatureType = FeatureType.Leader + +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/MRNA.kt b/src/main/kotlin/biokotlin/gff/MRNA.kt new file mode 100644 index 00000000..4960c755 --- /dev/null +++ b/src/main/kotlin/biokotlin/gff/MRNA.kt @@ -0,0 +1,64 @@ +package biokotlin.gff + +/** + * Also known as a Transcript + */ +class MRNA( + seqid: String, + source: String, + start: Int, + end: Int, + score: Double = Double.NaN, + strand: String = "+", + phase: String = ".", + attributes: Map = emptyMap(), + children: List = emptyList() +) : Feature(seqid, source, start, end, score, strand, phase, attributes, children) { + + override fun type(): FeatureType = FeatureType.mRNA + + /** + * @return A list of direct children that are an instance of [Coding], in order as defined by [FeatureComparator] + */ + fun coding(): List { + return children.filterIsInstance().toList() + } + + /** + * @return A list of direct children that are an instance of [Exon], in order as defined by [FeatureComparator] + */ + fun exons(): List { + return children.filterIsInstance().toList() + } + + /** + * @return A list of direct children that are an instance of [Leader], in order as defined by [FeatureComparator] + */ + fun leaders(): List { + return children.filterIsInstance().toList() + } + + /** + * @return A list of direct children that are an instance of [Terminator], in order as defined by [FeatureComparator] + */ + fun terminators(): List { + return children.filterIsInstance().toList() + } + + /** + * @return Introns in direct children + */ + fun introns(): List { + return children + .chunked(2) + .filter { it.size == 2 } + .map { + val seqid = it[0].seqid + val source = it[0].source + val start = it[0].end + val end = it[1].start + Intron(seqid, source, start, end) + } + } + +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/MakeGffForNewSpecies.kt b/src/main/kotlin/biokotlin/gff/MakeGffForNewSpecies.kt new file mode 100644 index 00000000..c92ce53a --- /dev/null +++ b/src/main/kotlin/biokotlin/gff/MakeGffForNewSpecies.kt @@ -0,0 +1,437 @@ +package biokotlin.gff + +import biokotlin.util.bufferedWriter +import htsjdk.samtools.CigarOperator +import htsjdk.samtools.SAMRecord +import htsjdk.samtools.SamReaderFactory +import htsjdk.samtools.ValidationStringency +import java.io.File + +/** + * Feature operator classifiers + * * `U` = undefined (`S`, `H`) + * * `I` = intron (`N`) + * * `C` = coding (`M`, `D`, `I`**, `=`, `X`) <- `I` is ignored + */ +enum class FeatureOperator { + U, C, I +} + +/** + * Data class for transcript features + * * `length` = length of feature + * * `operator` = what is the feature type? (*See `FeatureOperator` for more info*) + */ +data class TranscriptFeatures( + val length: Int, + val operator: FeatureOperator, + var id: Int +) + +/** + * Data class for BED columns + * * `chrom` = contig name + * * `chromStart` = start coordinate for sequence considered + * * `chromEnd` = end coordinate for sequence considered + * * `name` = ID for range + * * `strand` = DNA strand (`+`, `-`, or `.`) + */ +data class BedFeatures( + val chrom: String, + val chromStart: Int, + val chromEnd: Int, + val name: String, + val strand: Char + ) + +/** + * Data class for GFF columns (descriptions from https://useast.ensembl.org/info/website/upload/gff3.html) + * * seqid - name of the chromosome or scaffold + * * source - name of the program that generated this feature, or the data source (database/project name) + * * type - type of feature //TODO check which types our features can be (presumably only a subset of all possible types) + * * start - start position (STARTS COUNTING AT 1) + * * end - end position (STARTS COUNTING AT 1, INCLUSIVE) + * * score - floating point value + * * strand - '+', '-', or '.' + * * phase - One of '.', '0', '1' or '2'. '0' indicates that the first base of the feature is the first base of a codon, + * '1' that the second base is the first base of a codon, and so on.. + * * attributes - a map of tag-value pairs. Attribute tags with special meanings found at http://gmod.org/wiki/GFF3#GFF3_Format + * * key attributes: Parent, ID, Name (can be same as ID) + */ +data class GffFeatures( + val seqid: String, + val source: String, + val type: String, + val start: Int, + val end: Int, + val score: String, + val strand: Char, + val phase: Char, + val attributes: Map, +) { + /** + * Converts this GffFeature into a GFF row + */ + fun asRow(): String { + val sb = StringBuilder() + for ((tag, value) in attributes) { + sb.append("$tag=$value;") + } + return "$seqid\t$source\t$type\t$start\t$end\t$score\t$strand\t$phase\t$sb\n" + } +} + +// ZACK FUNCTIONS ---- + +data class TranscriptBpAlignmentStats(val xOrEqArray: MutableList, + val eqArray:MutableList, + var nCounts:Int = 0, + var dCounts:Int = 0, + var numAlignments:Int = 0 ) + +enum class ExonOperator { + L, C, T +} +val exonOperatorMap = mapOf('L' to ExonOperator.L,'C' to ExonOperator.C,'T' to ExonOperator.T) +data class ExonBoundary(val length: Int, val operator: ExonOperator) + +/** + * Function to take a single exon and parse out the operator and the length of that operation. + * It needs to have a running sub list of the number characters until it sees an operator. + * Then it will convert the temp String into an Int and save out the boundary. + */ +fun convertExonStringToBoundary(exonString: String) : List { + var runningCount = StringBuilder() + val exonBoundaries = mutableListOf() + for(element in exonString) { + //Check to see if we hit an operator + if(element in exonOperatorMap.keys) { + //if so, turn the runningCount into an Int and add the boundary to the list. + val countString = runningCount.toString() + check(!countString.isEmpty()) {"Error parsing exon name. No counts for operator"} + + exonBoundaries.add(ExonBoundary(countString.toInt(), exonOperatorMap[element]!!)) + runningCount.clear() + } + else { + runningCount.append("$element") + } + } + + return exonBoundaries +} + +/** + * Function to parse out the exon boundaries from the transcript name. + * They take the form: transcript_1234:100L24C-12C-90C10T + */ +fun parseExonBoundaries(transcriptName: String) : List> { + val exonStrings = transcriptName.split(":")[1].split("-") + + return exonStrings.map { convertExonStringToBoundary(it) } +} + +/** + * Compute CDS positions (Pair) + */ +fun computeCDSPositions(exonBoundaries:List>) : Pair { + val sizesOfOperators = exonBoundaries.flatten() + .groupBy { it.operator } + .map { Pair(it.key, it.value.map { currValue -> currValue.length }.sum()) } + .toMap() + + val leaderSize = sizesOfOperators[ExonOperator.L]?:0 + + val cdsSize = sizesOfOperators[ExonOperator.C]?:0 + + return Pair(leaderSize, leaderSize + cdsSize - 1) +} + +/** + * Function that actually creates the xOrEQ array and the eqArray. It also counts the number of N and D bps + */ +fun buildTranscriptBpAlignmentStats(samRecord: SAMRecord) : TranscriptBpAlignmentStats { + val xOrEqArray = Array(samRecord.readLength) { 0 } + val eqArray = Array(samRecord.readLength) { 0 } + + var nCounts = 0 + var dCounts = 0 + + var currentBp = 0 + val cigarElements = samRecord.cigar.cigarElements + + //Loop through the cigarElements + for(element in cigarElements) { + val operator = element.operator + val count = element.length + + if(operator== CigarOperator.N) { + nCounts+=count + } + else if(operator==CigarOperator.D) { + dCounts+=count + } + //Check to see if consumes query + else if(operator.consumesReadBases()) { + //If it consumes read bases, we can walk through the length of the CIGAR operator and set the position as a 1 + for(index in 0 until count) { + if(operator.isAlignment) { + xOrEqArray[currentBp] = 1 + } + + if (operator==CigarOperator.EQ) { + eqArray[currentBp] = 1 + } + + currentBp++ + } + } + + } + + //Check to see if it was reversed during alignment. If so we need to flip our arrays. + return if(samRecord.readNegativeStrandFlag) { + TranscriptBpAlignmentStats(xOrEqArray.reversed().toMutableList(), eqArray.reversed().toMutableList(), nCounts, dCounts,1) + } + else { + TranscriptBpAlignmentStats(xOrEqArray.toMutableList(), eqArray.toMutableList(),nCounts, dCounts,1) + } + +} + +/** + * Generate a list of feature lengths from a CIGAR string + * @param featureLengths A list of `TranscriptFeature` data objects + * @param samRecord A `SAMRecord` object + * @param taxaId What reference assembly was this aligned to? + * @return A list of `BedFeatures` data objects. + */ +fun calculateRanges(featureLengths: List, samRecord: SAMRecord, taxaId: String): MutableList { + var aggLength = samRecord.alignmentStart + val contig = samRecord.referenceName + val bedFeatures = mutableListOf() + val zmTranscriptRanges = parseExonBoundaries(samRecord.readName).flatten() + val strand = when { + samRecord.readNegativeStrandFlag -> '-' + else -> '+' + } + val prefixReadName = samRecord.readName.substringBefore(":") + + for (feature in featureLengths) { + if (feature.operator == FeatureOperator.C || feature.operator == FeatureOperator.I) { + bedFeatures.add( + BedFeatures( + contig, + (aggLength) - 1, + (aggLength + feature.length - 1), + "${taxaId}_${prefixReadName}_${feature.operator}.${feature.id}", + strand + ) + ) + aggLength += feature.length + } else { + continue + } + } + return bedFeatures +} + +/** + * Generate a list of feature lengths from a CIGAR string + * @param samRecord A SAM record entry + * @param taxaId What reference assembly was this aligned to? + * @return A list of `BedFeatures` data objects. + */ +fun buildFeatureRanges(samRecord: SAMRecord, taxaId: String): MutableList { + val cigarElements = samRecord.cigar.cigarElements + val transcriptFeatures = mutableListOf() + + // Loop through CIGAR elements + var sumLength = 0 + var i = 0 + var iCount = 1 + var cCount = 1 + var uCount = 1 + for (element in cigarElements) { + val operator = element.operator + + if (operator != CigarOperator.I) { + sumLength += element.length + } + // FIX - change to last iterator [prev: element == cigarElements.last()] + if (i == cigarElements.lastIndex && (operator.isAlignment || operator == CigarOperator.D)) { + transcriptFeatures.add(TranscriptFeatures(sumLength, FeatureOperator.C, cCount)) + break + } + // TODO: 2/6/2022 - how to handle I/D elements? + when (operator) { + CigarOperator.S, CigarOperator.H -> { + transcriptFeatures.add(TranscriptFeatures(element.length, FeatureOperator.U, uCount)) + sumLength = 0 + uCount++ + } + CigarOperator.N -> { + transcriptFeatures.add(TranscriptFeatures(element.length, FeatureOperator.I, iCount)) + sumLength = 0 + iCount++ + } + else -> { + if (cigarElements[i + 1].operator == CigarOperator.N || cigarElements[i + 1].operator.isClipping) { + transcriptFeatures.add(TranscriptFeatures(sumLength, FeatureOperator.C, cCount)) + cCount++ + } + } + + } + + i++ + } + + // Check to see if it was reversed during alignment. If so, we need to reverse the ID order (e.g. 1-18 -> 18-1). + val featureLengths = when { + samRecord.readNegativeStrandFlag -> { + val idList = mutableListOf() + transcriptFeatures.forEach { idList.add(it.id) } + val revIdList = idList.reversed() + var j = 0 + transcriptFeatures.forEach { + it.id = revIdList[j] + j++ + } + transcriptFeatures.toMutableList() + } + else -> { + transcriptFeatures + } + } + + return calculateRanges(featureLengths, samRecord, taxaId) +} + +/** + * Writes a BED file based on a SAM file. Generates coordinates for + * introns (I), coding regions (C), and whole transcripts. IDs (e.g. C.1) + * are based on strand (+, -). + * @param samFile A String object referring to a SAM file + * @param outputFile A String object referring to the output BED file + * @param taxaId What reference assembly was this aligned to? + * @param minAlignmentPercent Percent match threshold to alignment region. + */ +fun writeBedFile(samFile: String, outputFile: String, taxaId: String, minAlignmentPercent: Double = 0.90) { + // Read in SAM and creater iterator object ---- + println("Processing $taxaId ...") + val timeStartRead = System.currentTimeMillis() + val samIterator = SamReaderFactory.makeDefault() + .validationStringency(ValidationStringency.SILENT) + .setOption(SamReaderFactory.Option.CACHE_FILE_BASED_INDEXES, false) + .setOption(SamReaderFactory.Option.EAGERLY_DECODE, false) + .setOption(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, false) + .setOption(SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS, false) + .open(File(samFile)) + .iterator() + val timeEndRead = System.currentTimeMillis() - timeStartRead + println("Elapsed read time: $timeEndRead ms") + + // Make buffered reader and write to file ---- + val timeStart = System.currentTimeMillis() + val bw = bufferedWriter(outputFile) + while(samIterator.hasNext()) { + val currentRecord = samIterator.next() + + // Check for secondary, supplementary, or unmapped reads ---- + if (!currentRecord.isSecondaryOrSupplementary && !currentRecord.readUnmappedFlag) { + + // Check for minimum alignment % ---- + val exonBoundaries = parseExonBoundaries(currentRecord.readName) + val cdsBoundaries = computeCDSPositions(exonBoundaries) + val stats = buildTranscriptBpAlignmentStats(currentRecord) + val numMapping = stats.xOrEqArray.slice(cdsBoundaries.first .. cdsBoundaries.second).sum() + val alignmentPercentage = (numMapping.toDouble() / (cdsBoundaries.second - cdsBoundaries.first + 1)) + if (alignmentPercentage >= minAlignmentPercent) { + val bedStats = buildFeatureRanges(currentRecord, taxaId) // make BED ranges + val transId = "${currentRecord.referenceName}\t${currentRecord.alignmentStart - 1}\t${currentRecord.alignmentEnd}\t${taxaId}_${currentRecord.readName.substringBefore(":")}\t0\t${bedStats[0].strand}\n" + bw.write(transId) + bedStats.forEach { + bw.write("${it.chrom}\t${it.chromStart}\t${it.chromEnd}\t${it.name}\t0\t${it.strand}\n") + } + } + + } + } + bw.close() + + // Get time stats ---- + val timeEnd = System.currentTimeMillis() - timeStart + println("Elapsed BED creation time: $timeEnd ms") +} + +/** + * Writes a GFF file based on the SAM file. + * @param samFile A String object referring to a SAM file + * @param outputFile A String object referring to the output BED file + * @param taxaId What reference assembly was this aligned to? + * @param minQuality Percent match threshold to alignment region. + * @param maxNumber //TODO how to define this? + */ +fun writeGffFile(samFile: String, outputFile: String, taxaId: String, minQuality: Double = 0.90, maxNumber: Int = 1) { + // Read in SAM and creater iterator object ---- + println("Processing $taxaId ...") + val timeStartRead = System.currentTimeMillis() + val samIterator = SamReaderFactory.makeDefault() + .validationStringency(ValidationStringency.SILENT) + .setOption(SamReaderFactory.Option.CACHE_FILE_BASED_INDEXES, false) + .setOption(SamReaderFactory.Option.EAGERLY_DECODE, false) + .setOption(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, false) + .setOption(SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS, false) + .open(File(samFile)) + .iterator() + val timeEndRead = System.currentTimeMillis() - timeStartRead + println("Elapsed read time: $timeEndRead ms") + + // Make buffered reader and write to file ---- + val timeStart = System.currentTimeMillis() + val bw = bufferedWriter(outputFile) + while(samIterator.hasNext()) { + val currentRecord = samIterator.next() + + // Check for secondary, supplementary, or unmapped reads ---- + if (!currentRecord.isSecondaryOrSupplementary && !currentRecord.readUnmappedFlag) { + + // Check for minimum alignment % ---- + val exonBoundaries = parseExonBoundaries(currentRecord.readName) + val cdsBoundaries = computeCDSPositions(exonBoundaries) + val stats = buildTranscriptBpAlignmentStats(currentRecord) + val numMapping = stats.xOrEqArray.slice(cdsBoundaries.first .. cdsBoundaries.second).sum() + val alignmentPercentage = (numMapping.toDouble() / (cdsBoundaries.second - cdsBoundaries.first + 1)) + if (alignmentPercentage >= minQuality) { + val bedStats = buildFeatureRanges(currentRecord, taxaId) // make BED ranges //TODO make these GFF + + val topID = "${taxaId}_${currentRecord.readName.substringBefore(":")}"; + //TODO does this parent represent mRNA? + //TODO fill in the values + val mRNA = GffFeatures( + currentRecord.referenceName, + "SOURCE", + "mRNA", + currentRecord.alignmentStart, + currentRecord.alignmentEnd, + ".", + bedStats[0].strand, + '.', + mapOf("ID" to topID, "Name" to topID) + ) + bw.write(mRNA.asRow()) + //TODO key line to change + bedStats.forEach { + bw.write("${it.chrom}\t${it.chromStart}\t${it.chromEnd}\t${it.name}\t0\t${it.strand}\n") + } + } + + } + } + bw.close() + + // Get time stats ---- + val timeEnd = System.currentTimeMillis() - timeStart + println("Elapsed BED creation time: $timeEnd ms") +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/Scaffold.kt b/src/main/kotlin/biokotlin/gff/Scaffold.kt new file mode 100644 index 00000000..476f5369 --- /dev/null +++ b/src/main/kotlin/biokotlin/gff/Scaffold.kt @@ -0,0 +1,14 @@ +package biokotlin.gff + +class Scaffold( + seqid: String, + source: String, + start: Int, + end: Int, + attributes: Map = emptyMap(), + children: List = emptyList() +) : Feature(seqid, source, start, end, attributes = attributes, children = children) { + + override fun type(): FeatureType = FeatureType.Scaffold + +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/gff/Terminator.kt b/src/main/kotlin/biokotlin/gff/Terminator.kt new file mode 100644 index 00000000..9b6e5a12 --- /dev/null +++ b/src/main/kotlin/biokotlin/gff/Terminator.kt @@ -0,0 +1,20 @@ +package biokotlin.gff + +/** + * Also known as 3' UTR + */ +class Terminator( + seqid: String, + source: String, + start: Int, + end: Int, + score: Double = Double.NaN, + strand: String = "+", + phase: String = ".", + attributes: Map = emptyMap(), + children: List = emptyList() +) : Feature(seqid, source, start, end, score, strand, phase, attributes, children) { + + override fun type(): FeatureType = FeatureType.Terminator + +} \ No newline at end of file diff --git a/src/main/kotlin/biokotlin/util/FileIO.kt b/src/main/kotlin/biokotlin/util/FileIO.kt index b4a92097..7eba853d 100644 --- a/src/main/kotlin/biokotlin/util/FileIO.kt +++ b/src/main/kotlin/biokotlin/util/FileIO.kt @@ -2,10 +2,10 @@ package biokotlin.util -import java.io.BufferedReader -import java.io.File +import java.io.* import java.net.URL import java.util.zip.GZIPInputStream +import java.util.zip.GZIPOutputStream fun bufferedReader(filename: String): BufferedReader { return try { @@ -21,6 +21,18 @@ fun bufferedReader(filename: String): BufferedReader { File(filename).bufferedReader() } } catch (e: Exception) { - throw IllegalStateException("FileIO: getBufferedReader: problem getting reader: " + e.message) + throw IllegalStateException("FileIO: bufferedReader: problem getting reader: " + e.message) + } +} + +fun bufferedWriter(filename: String, append: Boolean = false): BufferedWriter { + return try { + if (filename.endsWith(".gz")) { + BufferedWriter(OutputStreamWriter(GZIPOutputStream(FileOutputStream(filename, append)))) + } else { + BufferedWriter(OutputStreamWriter(FileOutputStream(filename, append))) + } + } catch (e: Exception) { + throw IllegalStateException("FileIO: bufferedWriter: problem getting writer: " + e.message) } } \ No newline at end of file