Skip to content

Commit cf3a7dc

Browse files
committed
Optimized lazy materialization + parallel sort for Kotlin Notebooks
1 parent d76a2d3 commit cf3a7dc

File tree

3 files changed

+127
-2
lines changed

3 files changed

+127
-2
lines changed

core/build.gradle.kts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ plugins {
1717
alias(kodex)
1818
alias(buildconfig)
1919
alias(binary.compatibility.validator)
20+
alias(kotlinx.benchmark)
2021

2122
// generates keywords using the :generator module
2223
alias(keywordGenerator)
@@ -75,6 +76,7 @@ dependencies {
7576
testImplementation(libs.kotestAssertions) {
7677
exclude("org.jetbrains.kotlin", "kotlin-stdlib-jdk8")
7778
}
79+
testImplementation(libs.kotlinx.benchmark.runtime)
7880
testImplementation(libs.kotlin.scriptingJvm)
7981
testImplementation(libs.jsoup)
8082
testImplementation(libs.sl4jsimple)
@@ -89,6 +91,17 @@ dependencies {
8991
testImplementation(projects.dataframeCsv)
9092
}
9193

94+
// kotlinx-benchmark setup: registers the "test" target with the plugin and
// declares a "sort" configuration that includes "SortingBenchmark".
benchmark {
    targets.register("test")
    configurations.register("sort") {
        include("SortingBenchmark")
    }
}
104+
92105
val samplesImplementation by configurations.getting {
93106
extendsFrom(configurations.testImplementation.get())
94107
}

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/KotlinNotebookPluginUtils.kt

Lines changed: 55 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package org.jetbrains.kotlinx.dataframe.jupyter
33
import org.jetbrains.kotlinx.dataframe.AnyCol
44
import org.jetbrains.kotlinx.dataframe.AnyFrame
55
import org.jetbrains.kotlinx.dataframe.AnyRow
6+
import org.jetbrains.kotlinx.dataframe.DataFrame
67
import org.jetbrains.kotlinx.dataframe.DataRow
78
import org.jetbrains.kotlinx.dataframe.annotations.RequiredByIntellijPlugin
89
import org.jetbrains.kotlinx.dataframe.api.Convert
@@ -29,15 +30,17 @@ import org.jetbrains.kotlinx.dataframe.api.at
2930
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
3031
import org.jetbrains.kotlinx.dataframe.api.frames
3132
import org.jetbrains.kotlinx.dataframe.api.getColumn
33+
import org.jetbrains.kotlinx.dataframe.api.getRows
3234
import org.jetbrains.kotlinx.dataframe.api.into
3335
import org.jetbrains.kotlinx.dataframe.api.isFrameColumn
3436
import org.jetbrains.kotlinx.dataframe.api.isList
35-
import org.jetbrains.kotlinx.dataframe.api.sortWith
37+
import org.jetbrains.kotlinx.dataframe.api.rows
3638
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
3739
import org.jetbrains.kotlinx.dataframe.api.values
3840
import org.jetbrains.kotlinx.dataframe.api.valuesAreComparable
3941
import org.jetbrains.kotlinx.dataframe.columns.ColumnPath
4042
import org.jetbrains.kotlinx.dataframe.impl.ColumnNameGenerator
43+
import java.util.Arrays
4144

4245
/**
4346
* A class with utility methods for Kotlin Notebook Plugin integration.
@@ -105,9 +108,34 @@ public object KotlinNotebookPluginUtils {
105108
ColumnPath(path)
106109
}
107110

111+
if (sortKeys.size == 1) {
112+
val column = df.getColumn(sortKeys[0])
113+
114+
// Not sure how to have generic logic that would produce Comparator<Int> and Comparator<DataRow> without overhead
115+
// For now, Comparator<DataRow> is still needed for the fallback case of sorting by multiple columns, although the UI currently cannot trigger it.
116+
// Please make sure to change both this and createColumnComparator
117+
val comparator: Comparator<Int> = when {
118+
column.valuesAreComparable() -> compareBy(nullsLast()) {
119+
column[it] as Comparable<Any>?
120+
}
121+
122+
column.isFrameColumn() -> compareBy { column[it].rowsCount() }
123+
124+
column.isList() -> compareBy { (column[it] as? List<*>)?.size ?: 0 }
125+
126+
else -> compareBy { column[it]?.toString() ?: "" }
127+
}
128+
129+
val finalComparator = if (isDesc[0]) comparator.reversed() else comparator
130+
131+
val permutation = Array(column.size()) { it }
132+
Arrays.parallelSort(permutation, finalComparator)
133+
return SortedDataFrameView(df, permutation.asList())
134+
}
135+
108136
val comparator = createComparator(df, sortKeys, isDesc)
109137

110-
return df.sortWith(comparator)
138+
return df.sortWithLazy(comparator)
111139
}
112140

113141
private fun createComparator(
@@ -153,6 +181,31 @@ public object KotlinNotebookPluginUtils {
153181
return if (desc) comparator.reversed() else comparator
154182
}
155183

184+
/**
 * Orders the rows of this frame with [comparator] and returns the result as a
 * [SortedDataFrameView] over this frame.
 *
 * Only the sorted list of row indices is computed here; the view receives this
 * frame together with that list.
 */
private fun <T> DataFrame<T>.sortWithLazy(comparator: Comparator<DataRow<T>>): DataFrame<T> =
    SortedDataFrameView(
        source = this,
        permutation = rows().sortedWith(comparator).map { row -> row.index() },
    )
188+
189+
/**
 * View over [source] that exposes its rows in the order given by [permutation]:
 * position `i` of the view corresponds to row `permutation[i]` of [source].
 *
 * Only the `get` overloads overridden below apply the permutation. Every other
 * [DataFrame] member is forwarded to [source] by class delegation and therefore
 * still sees the rows in their original order.
 *
 * NOTE(review): because non-overridden members bypass the permutation, wrapping a
 * view inside another view may mix sorted and unsorted indices — confirm callers
 * never sort an already sorted view.
 */
private class SortedDataFrameView<T>(private val source: DataFrame<T>, private val permutation: List<Int>) :
    DataFrame<T> by source {

    /** Returns the source row located at sorted position [index]. */
    override operator fun get(index: Int): DataRow<T> {
        val sourceIndex = permutation[index]
        return source[sourceIndex]
    }

    /** Translates every sorted position in [range] to its source index and fetches those rows. */
    override operator fun get(range: IntRange): DataFrame<T> =
        source.getRows(range.map { position -> permutation[position] })

    /** Translates every sorted position in [indices] to its source index and fetches those rows. */
    override operator fun get(indices: Iterable<Int>): DataFrame<T> =
        source.getRows(indices.map { position -> permutation[position] })

    /** Looks the column up through the inherited `get(String)` and indexes it with the whole permutation. */
    override fun get(columnName: String): AnyCol = super.get(columnName)[permutation]
}
208+
156209
internal fun isDataframeConvertable(dataframeLike: Any?): Boolean =
157210
when (dataframeLike) {
158211
is Pivot<*>,
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
@file:Suppress("EmptyRange")
2+
3+
package org.jetbrains.kotlinx.dataframe
4+
5+
import kotlinx.benchmark.Benchmark
6+
import kotlinx.benchmark.BenchmarkMode
7+
import kotlinx.benchmark.BenchmarkTimeUnit
8+
import kotlinx.benchmark.Measurement
9+
import kotlinx.benchmark.Mode
10+
import kotlinx.benchmark.OutputTimeUnit
11+
import kotlinx.benchmark.Param
12+
import kotlinx.benchmark.Scope
13+
import kotlinx.benchmark.Setup
14+
import kotlinx.benchmark.State
15+
import kotlinx.benchmark.Warmup
16+
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
17+
import org.jetbrains.kotlinx.dataframe.api.toColumn
18+
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
19+
import org.jetbrains.kotlinx.dataframe.jupyter.KotlinNotebookPluginUtils
20+
import kotlin.random.Random
21+
22+
@State(Scope.Benchmark)
23+
@Warmup(iterations = 5, time = 1)
24+
@Measurement(iterations = 5, time = 1)
25+
@BenchmarkMode(Mode.AverageTime)
26+
@OutputTimeUnit(BenchmarkTimeUnit.MILLISECONDS)
27+
open class SortingBenchmark {
28+
29+
@Param("10000", "100000", "1000000")
30+
var size: Int = 0
31+
32+
@Param("int", "string", "double", "category", "list", "frame")
33+
lateinit var columnType: String
34+
35+
private lateinit var df: DataFrame<*>
36+
private lateinit var columnPath: List<String>
37+
38+
@Setup
39+
fun setup() {
40+
val random = Random(42)
41+
df = (0 until size).toDataFrame {
42+
"int" from { it }
43+
"string" from { "name_${random.nextInt(1000)}" }
44+
"double" from { random.nextDouble() }
45+
"category" from { listOf("A", "B", "C", "D").random(random) }
46+
"list" from { List(random.nextInt(1, 20)) { "tag$it" } }
47+
"frame" from {
48+
dataFrameOf("x" to List(random.nextInt(1, 50)) { random.nextInt() }.toColumn())
49+
}
50+
}
51+
columnPath = listOf(columnType)
52+
}
53+
54+
@Benchmark
55+
fun sort(): DataFrame<*> {
56+
val sorted = KotlinNotebookPluginUtils.sortByColumns(df, listOf(columnPath), listOf(false))
57+
return KotlinNotebookPluginUtils.getRowsSubsetForRendering(sorted, 0, 20).value
58+
}
59+
}

0 commit comments

Comments
 (0)