Skip to content

Commit 062d18c

Browse files
Automated commit of generated code
1 parent 426adc7 commit 062d18c

File tree

5 files changed

+241
-49
lines changed

5 files changed

+241
-49
lines changed

core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/DataFrame.kt

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -102,9 +102,11 @@ public interface DataFrame<out T> :
102102
@RequiredByIntellijPlugin
103103
public operator fun get(index: Int): DataRow<T>
104104

105-
public operator fun get(indices: Iterable<Int>): DataFrame<T> = getRows(indices)
105+
public operator fun get(indices: Iterable<Int>): DataFrame<T> =
106+
columns().map { col -> col[indices] }.toDataFrame().cast()
106107

107-
public operator fun get(range: IntRange): DataFrame<T> = getRows(range)
108+
public operator fun get(range: IntRange): DataFrame<T> =
109+
if (range == indices()) this else columns().map { col -> col[range] }.toDataFrame().cast()
108110

109111
public operator fun get(first: IntRange, vararg ranges: IntRange): DataFrame<T> =
110112
getRows(headPlusArray(first, ranges).asSequence().flatMap { it.asSequence() }.asIterable())

core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/DataFrameGet.kt

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -45,11 +45,9 @@ public fun <T> DataFrame<T>.getColumns(vararg columns: String): List<AnyCol> = g
4545

4646
public fun <T> DataFrame<T>.getColumnIndex(col: AnyCol): Int = getColumnIndex(col.name())
4747

48-
public fun <T> DataFrame<T>.getRows(range: IntRange): DataFrame<T> =
49-
if (range == indices()) this else columns().map { col -> col[range] }.toDataFrame().cast()
48+
public fun <T> DataFrame<T>.getRows(range: IntRange): DataFrame<T> = get(range)
5049

51-
public fun <T> DataFrame<T>.getRows(indices: Iterable<Int>): DataFrame<T> =
52-
columns().map { col -> col[indices] }.toDataFrame().cast()
50+
public fun <T> DataFrame<T>.getRows(indices: Iterable<Int>): DataFrame<T> = get(indices)
5351

5452
public fun <T> DataFrame<T>.getOrNull(index: Int): DataRow<T>? = if (index < 0 || index >= nrow) null else get(index)
5553

core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/KotlinNotebookPluginUtils.kt

Lines changed: 92 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@ package org.jetbrains.kotlinx.dataframe.jupyter
33
import org.jetbrains.kotlinx.dataframe.AnyCol
44
import org.jetbrains.kotlinx.dataframe.AnyFrame
55
import org.jetbrains.kotlinx.dataframe.AnyRow
6+
import org.jetbrains.kotlinx.dataframe.DataFrame
67
import org.jetbrains.kotlinx.dataframe.DataRow
78
import org.jetbrains.kotlinx.dataframe.annotations.RequiredByIntellijPlugin
89
import org.jetbrains.kotlinx.dataframe.api.Convert
@@ -29,13 +30,17 @@ import org.jetbrains.kotlinx.dataframe.api.at
2930
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
3031
import org.jetbrains.kotlinx.dataframe.api.frames
3132
import org.jetbrains.kotlinx.dataframe.api.getColumn
33+
import org.jetbrains.kotlinx.dataframe.api.getRows
3234
import org.jetbrains.kotlinx.dataframe.api.into
33-
import org.jetbrains.kotlinx.dataframe.api.sortWith
35+
import org.jetbrains.kotlinx.dataframe.api.isFrameColumn
36+
import org.jetbrains.kotlinx.dataframe.api.isList
37+
import org.jetbrains.kotlinx.dataframe.api.rows
3438
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
3539
import org.jetbrains.kotlinx.dataframe.api.values
3640
import org.jetbrains.kotlinx.dataframe.api.valuesAreComparable
3741
import org.jetbrains.kotlinx.dataframe.columns.ColumnPath
3842
import org.jetbrains.kotlinx.dataframe.impl.ColumnNameGenerator
43+
import java.util.Arrays
3944

4045
/**
4146
* A class with utility methods for Kotlin Notebook Plugin integration.
@@ -68,6 +73,10 @@ public object KotlinNotebookPluginUtils {
6873
/**
6974
* Sorts a dataframe-like object by multiple columns.
7075
* If a column type is not comparable, sorting by string representation is applied instead.
76+
* Frame columns are sorted by the row count of their nested dataframes and list columns by list size, because looking at the smallest / biggest groups after groupBy is a very common use case.
77+
*
78+
* Returns a "lazily materialized" dataframe: a get, getRows or take operation must be applied to it to obtain a valid sorted dataframe.
79+
* "Lazily materialized" means that, e.g., after sorting 1 million rows with a page size of 100, a dataframe with only 100 rows is created.
7180
*
7281
* @param dataFrameLike The dataframe-like object to sort.
7382
* @param columnPaths The list of columns to sort by. Each element in the list represents a column path
@@ -103,60 +112,100 @@ public object KotlinNotebookPluginUtils {
103112
ColumnPath(path)
104113
}
105114

106-
val comparator = createComparator(sortKeys, isDesc)
115+
if (sortKeys.size == 1) {
116+
val column = df.getColumn(sortKeys[0])
117+
118+
// Not sure how to write generic logic that would produce both Comparator<Int> and Comparator<DataRow> without overhead.
119+
// For now, Comparator<DataRow> is still needed for the fallback case of sorting by multiple columns, although that is currently impossible in the UI.
120+
// Please make sure to keep this and createColumnComparator in sync when changing either of them.
121+
val comparator: Comparator<Int> = when {
122+
column.valuesAreComparable() -> compareBy(nullsLast()) {
123+
column[it] as Comparable<Any>?
124+
}
125+
126+
column.isFrameColumn() -> compareBy { column[it].rowsCount() }
127+
128+
column.isList() -> compareBy { (column[it] as? List<*>)?.size ?: 0 }
129+
130+
else -> compareBy { column[it]?.toString() ?: "" }
131+
}
132+
133+
val finalComparator = if (isDesc[0]) comparator.reversed() else comparator
134+
135+
val permutation = Array(column.size()) { it }
136+
Arrays.parallelSort(permutation, finalComparator)
137+
return SortedDataFrameView(df, permutation.asList())
138+
}
139+
140+
val comparator = createComparator(df, sortKeys, isDesc)
107141

108-
return df.sortWith(comparator)
142+
return df.sortWithLazy(comparator)
109143
}
110144

111-
private fun createComparator(sortKeys: List<ColumnPath>, isDesc: List<Boolean>): Comparator<DataRow<*>> {
112-
return Comparator { row1, row2 ->
113-
for ((key, desc) in sortKeys.zip(isDesc)) {
114-
val comparisonResult = if (row1.df().getColumn(key).valuesAreComparable()) {
115-
compareComparableValues(row1, row2, key, desc)
116-
} else {
117-
compareStringValues(row1, row2, key, desc)
145+
private fun createComparator(
146+
df: AnyFrame,
147+
sortKeys: List<ColumnPath>,
148+
isDesc: List<Boolean>,
149+
): Comparator<DataRow<*>> {
150+
val columnComparators = sortKeys.zip(isDesc).map { (key, desc) ->
151+
val column = df.getColumn(key)
152+
createColumnComparator(column, desc)
153+
}
154+
155+
return when (columnComparators.size) {
156+
1 -> columnComparators.single()
157+
158+
else -> Comparator { row1, row2 ->
159+
for (comparator in columnComparators) {
160+
val result = comparator.compare(row1, row2)
161+
// If a comparison result is non-zero, we have resolved the ordering
162+
if (result != 0) return@Comparator result
118163
}
119-
// If a comparison result is non-zero, we have resolved the ordering
120-
if (comparisonResult != 0) return@Comparator comparisonResult
164+
// All comparisons are equal
165+
0
121166
}
122-
// All comparisons are equal
123-
0
124167
}
125168
}
126169

127-
@Suppress("UNCHECKED_CAST")
128-
private fun compareComparableValues(
129-
row1: DataRow<*>,
130-
row2: DataRow<*>,
131-
key: ColumnPath,
132-
desc: Boolean,
133-
): Int {
134-
val firstValue = row1.getValueOrNull(key) as Comparable<Any?>?
135-
val secondValue = row2.getValueOrNull(key) as Comparable<Any?>?
136-
137-
return when {
138-
firstValue == null && secondValue == null -> 0
139-
firstValue == null -> if (desc) 1 else -1
140-
secondValue == null -> if (desc) -1 else 1
141-
desc -> secondValue.compareTo(firstValue)
142-
else -> firstValue.compareTo(secondValue)
170+
private fun createColumnComparator(column: AnyCol, desc: Boolean): Comparator<DataRow<*>> {
171+
val comparator: Comparator<DataRow<*>> = when {
172+
column.valuesAreComparable() -> compareBy(nullsLast()) {
173+
column[it] as Comparable<Any?>?
174+
}
175+
176+
// An explicit Comparator (rather than compareBy) shows a slight improvement in performance for this case
177+
column.isFrameColumn() -> Comparator { r1, r2 ->
178+
column[r1].rowsCount().compareTo(column[r2].rowsCount())
179+
}
180+
181+
column.isList() -> compareBy { (column[it] as? List<*>)?.size ?: 0 }
182+
183+
else -> compareBy { column[it]?.toString() ?: "" }
143184
}
185+
return if (desc) comparator.reversed() else comparator
144186
}
145187

146-
private fun compareStringValues(
147-
row1: DataRow<*>,
148-
row2: DataRow<*>,
149-
key: ColumnPath,
150-
desc: Boolean,
151-
): Int {
152-
val firstValue = (row1.getValueOrNull(key)?.toString() ?: "")
153-
val secondValue = (row2.getValueOrNull(key)?.toString() ?: "")
154-
155-
return if (desc) {
156-
secondValue.compareTo(firstValue)
157-
} else {
158-
firstValue.compareTo(secondValue)
188+
private fun <T> DataFrame<T>.sortWithLazy(comparator: Comparator<DataRow<T>>): DataFrame<T> {
189+
val permutation = rows().sortedWith(comparator).map { it.index() }
190+
return SortedDataFrameView(this, permutation)
191+
}
192+
193+
private class SortedDataFrameView<T>(private val source: DataFrame<T>, private val permutation: List<Int>) :
194+
DataFrame<T> by source {
195+
196+
override operator fun get(index: Int): DataRow<T> = source[permutation[index]]
197+
198+
override operator fun get(range: IntRange): DataFrame<T> {
199+
val indices = range.map { permutation[it] }
200+
return source.getRows(indices)
159201
}
202+
203+
override operator fun get(indices: Iterable<Int>): DataFrame<T> {
204+
val mappedIndices = indices.map { permutation[it] }
205+
return source.getRows(mappedIndices)
206+
}
207+
208+
override fun get(columnName: String): AnyCol = super.get(columnName)[permutation]
160209
}
161210

162211
internal fun isDataframeConvertable(dataframeLike: Any?): Boolean =
Lines changed: 84 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,84 @@
1+
package org.jetbrains.kotlinx.dataframe
2+
3+
import io.kotest.matchers.shouldBe
4+
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
5+
import org.jetbrains.kotlinx.dataframe.api.toColumn
6+
import org.jetbrains.kotlinx.dataframe.jupyter.KotlinNotebookPluginUtils
7+
import org.junit.Test
8+
import kotlin.random.Random
9+
10+
/**
11+
* Other tests are located in Jupyter module:
12+
* org.jetbrains.kotlinx.dataframe.jupyter.RenderingTests
13+
*/
14+
class KotlinNotebookPluginUtilsTests {
15+
@Test
16+
fun `sort lists by size descending`() {
17+
val random = Random(123)
18+
val lists = List(20) { List(random.nextInt(1, 100)) { it } } + null
19+
val df = dataFrameOf("listColumn" to lists)
20+
21+
val res = KotlinNotebookPluginUtils.sortByColumns(df, listOf(listOf("listColumn")), desc = listOf(true))
22+
23+
res["listColumn"].values() shouldBe lists.sortedByDescending { it?.size ?: 0 }
24+
}
25+
26+
@Test
27+
fun `sort lists by size ascending`() {
28+
val lists = listOf(listOf(1, 2, 3), listOf(1), listOf(1, 2), null)
29+
val df = dataFrameOf("listColumn" to lists)
30+
31+
val res = KotlinNotebookPluginUtils.sortByColumns(df, listOf(listOf("listColumn")), desc = listOf(false))
32+
33+
res["listColumn"].values() shouldBe listOf(null, listOf(1), listOf(1, 2), listOf(1, 2, 3))
34+
}
35+
36+
@Test
37+
fun `sort empty lists`() {
38+
val lists = listOf(listOf(1, 2), emptyList(), listOf(1), emptyList())
39+
val df = dataFrameOf("listColumn" to lists)
40+
41+
val res = KotlinNotebookPluginUtils.sortByColumns(df, listOf(listOf("listColumn")), desc = listOf(true))
42+
43+
res["listColumn"].values() shouldBe listOf(listOf(1, 2), listOf(1), emptyList(), emptyList())
44+
}
45+
46+
@Test
47+
fun `sort lists with equal sizes preserves stability`() {
48+
val lists = listOf(listOf("a"), listOf("b"), listOf("c"))
49+
val df = dataFrameOf("listColumn" to lists)
50+
51+
val res = KotlinNotebookPluginUtils.sortByColumns(df, listOf(listOf("listColumn")), desc = listOf(true))
52+
53+
res["listColumn"].values() shouldBe lists
54+
}
55+
56+
@Test
57+
fun `sort frame column by row count descending`() {
58+
val frames = listOf(
59+
dataFrameOf("x" to listOf(1)),
60+
dataFrameOf("x" to listOf(1, 2, 3)),
61+
dataFrameOf("x" to listOf(1, 2)),
62+
DataFrame.empty(),
63+
)
64+
val df = dataFrameOf("nested" to frames.toColumn())
65+
66+
val res = KotlinNotebookPluginUtils.sortByColumns(df, listOf(listOf("nested")), desc = listOf(true))
67+
68+
res["nested"].values().map { (it as DataFrame<*>).rowsCount() } shouldBe listOf(3, 2, 1, 0)
69+
}
70+
71+
@Test
72+
fun `sort frame column by row count ascending`() {
73+
val frames = listOf(
74+
dataFrameOf("x" to listOf(1, 2, 3)),
75+
dataFrameOf("x" to listOf(1)),
76+
DataFrame.empty(),
77+
)
78+
val df = dataFrameOf("nested" to frames.toColumn())
79+
80+
val res = KotlinNotebookPluginUtils.sortByColumns(df, listOf(listOf("nested")), desc = listOf(false))
81+
82+
res["nested"].values().map { (it as DataFrame<*>).rowsCount() } shouldBe listOf(0, 1, 3)
83+
}
84+
}
Lines changed: 59 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,59 @@
1+
@file:Suppress("EmptyRange")
2+
3+
package org.jetbrains.kotlinx.dataframe
4+
5+
import kotlinx.benchmark.Benchmark
6+
import kotlinx.benchmark.BenchmarkMode
7+
import kotlinx.benchmark.BenchmarkTimeUnit
8+
import kotlinx.benchmark.Measurement
9+
import kotlinx.benchmark.Mode
10+
import kotlinx.benchmark.OutputTimeUnit
11+
import kotlinx.benchmark.Param
12+
import kotlinx.benchmark.Scope
13+
import kotlinx.benchmark.Setup
14+
import kotlinx.benchmark.State
15+
import kotlinx.benchmark.Warmup
16+
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
17+
import org.jetbrains.kotlinx.dataframe.api.toColumn
18+
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
19+
import org.jetbrains.kotlinx.dataframe.jupyter.KotlinNotebookPluginUtils
20+
import kotlin.random.Random
21+
22+
@State(Scope.Benchmark)
23+
@Warmup(iterations = 5, time = 1)
24+
@Measurement(iterations = 5, time = 1)
25+
@BenchmarkMode(Mode.AverageTime)
26+
@OutputTimeUnit(BenchmarkTimeUnit.MILLISECONDS)
27+
open class SortingBenchmark {
28+
29+
@Param("10000", "100000", "1000000")
30+
var size: Int = 0
31+
32+
@Param("int", "string", "double", "category", "list", "frame")
33+
lateinit var columnType: String
34+
35+
private lateinit var df: DataFrame<*>
36+
private lateinit var columnPath: List<String>
37+
38+
@Setup
39+
fun setup() {
40+
val random = Random(42)
41+
df = (0 until size).toDataFrame {
42+
"int" from { it }
43+
"string" from { "name_${random.nextInt(1000)}" }
44+
"double" from { random.nextDouble() }
45+
"category" from { listOf("A", "B", "C", "D").random(random) }
46+
"list" from { List(random.nextInt(1, 20)) { "tag$it" } }
47+
"frame" from {
48+
dataFrameOf("x" to List(random.nextInt(1, 50)) { random.nextInt() }.toColumn())
49+
}
50+
}
51+
columnPath = listOf(columnType)
52+
}
53+
54+
@Benchmark
55+
fun sort(): DataFrame<*> {
56+
val sorted = KotlinNotebookPluginUtils.sortByColumns(df, listOf(columnPath), listOf(false))
57+
return KotlinNotebookPluginUtils.getRowsSubsetForRendering(sorted, 0, 20).value
58+
}
59+
}

0 commit comments

Comments
 (0)