@@ -3,6 +3,7 @@ package org.jetbrains.kotlinx.dataframe.jupyter
33import org.jetbrains.kotlinx.dataframe.AnyCol
44import org.jetbrains.kotlinx.dataframe.AnyFrame
55import org.jetbrains.kotlinx.dataframe.AnyRow
6+ import org.jetbrains.kotlinx.dataframe.DataFrame
67import org.jetbrains.kotlinx.dataframe.DataRow
78import org.jetbrains.kotlinx.dataframe.annotations.RequiredByIntellijPlugin
89import org.jetbrains.kotlinx.dataframe.api.Convert
@@ -29,13 +30,17 @@ import org.jetbrains.kotlinx.dataframe.api.at
2930import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
3031import org.jetbrains.kotlinx.dataframe.api.frames
3132import org.jetbrains.kotlinx.dataframe.api.getColumn
33+ import org.jetbrains.kotlinx.dataframe.api.getRows
3234import org.jetbrains.kotlinx.dataframe.api.into
33- import org.jetbrains.kotlinx.dataframe.api.sortWith
35+ import org.jetbrains.kotlinx.dataframe.api.isFrameColumn
36+ import org.jetbrains.kotlinx.dataframe.api.isList
37+ import org.jetbrains.kotlinx.dataframe.api.rows
3438import org.jetbrains.kotlinx.dataframe.api.toDataFrame
3539import org.jetbrains.kotlinx.dataframe.api.values
3640import org.jetbrains.kotlinx.dataframe.api.valuesAreComparable
3741import org.jetbrains.kotlinx.dataframe.columns.ColumnPath
3842import org.jetbrains.kotlinx.dataframe.impl.ColumnNameGenerator
43+ import java.util.Arrays
3944
4045/* *
4146 * A class with utility methods for Kotlin Notebook Plugin integration.
@@ -68,6 +73,10 @@ public object KotlinNotebookPluginUtils {
6873 /* *
6974 * Sorts a dataframe-like object by multiple columns.
7075 * If a column type is not comparable, sorting by string representation is applied instead.
76+ * Frame columns (cells that are DataFrames) are sorted by their row count, because looking at the smallest / biggest groups after groupBy is very common.
77+ *
78+ * Returns a "lazily materialized" dataframe, which means a get, getRows, or take operation must be applied to turn it into a valid sorted dataframe.
79+ * "Lazily materialized" means that, for example, after sorting 1 million rows with a page size of 100, a dataframe with only 100 rows is created.
7180 *
7281 * @param dataFrameLike The dataframe-like object to sort.
7382 * @param columnPaths The list of columns to sort by. Each element in the list represents a column path
@@ -103,60 +112,100 @@ public object KotlinNotebookPluginUtils {
103112 ColumnPath (path)
104113 }
105114
106- val comparator = createComparator(sortKeys, isDesc)
115+ if (sortKeys.size == 1 ) {
116+ val column = df.getColumn(sortKeys[0 ])
117+
118+ // Not sure how to have generic logic that would produce Comparator<Int> and Comparator<DataRow> without overhead
119+ // For now Comparator<DataRow> is needed for fallback case of sorting multiple columns. Although it's now impossible in UI
120+ // Please make sure to change both this and createColumnComparator
121+ val comparator: Comparator <Int > = when {
122+ column.valuesAreComparable() -> compareBy(nullsLast()) {
123+ column[it] as Comparable <Any >?
124+ }
125+
126+ column.isFrameColumn() -> compareBy { column[it].rowsCount() }
127+
128+ column.isList() -> compareBy { (column[it] as ? List <* >)?.size ? : 0 }
129+
130+ else -> compareBy { column[it]?.toString() ? : " " }
131+ }
132+
133+ val finalComparator = if (isDesc[0 ]) comparator.reversed() else comparator
134+
135+ val permutation = Array (column.size()) { it }
136+ Arrays .parallelSort(permutation, finalComparator)
137+ return SortedDataFrameView (df, permutation.asList())
138+ }
139+
140+ val comparator = createComparator(df, sortKeys, isDesc)
107141
108- return df.sortWith (comparator)
142+ return df.sortWithLazy (comparator)
109143 }
110144
/**
 * Builds a row comparator for sorting [df] by several columns at once.
 *
 * Each column is looked up in [df] a single time and turned into its own comparator via
 * [createColumnComparator]; the per-column comparators are then consulted in order.
 *
 * @param df The dataframe whose columns are resolved from [sortKeys].
 * @param sortKeys Paths of the columns to sort by, in priority order.
 * @param isDesc Descending flags, paired positionally with [sortKeys] (pairing stops at the shorter list).
 * @return The single column comparator when exactly one key is paired, otherwise a comparator that
 * returns the first non-zero per-column result, or 0 when every column reports a tie.
 */
private fun createComparator(
    df: AnyFrame,
    sortKeys: List<ColumnPath>,
    isDesc: List<Boolean>,
): Comparator<DataRow<*>> {
    val perColumn = sortKeys.zip(isDesc) { key, desc ->
        createColumnComparator(df.getColumn(key), desc)
    }

    // A lone sort key needs no combining wrapper.
    if (perColumn.size == 1) return perColumn[0]

    return Comparator { left, right ->
        // Stays 0 when there are no comparators or when all of them report equality.
        var outcome = 0
        for (columnComparator in perColumn) {
            outcome = columnComparator.compare(left, right)
            // The first column that tells the rows apart decides the ordering.
            if (outcome != 0) break
        }
        outcome
    }
}
126169
127- @Suppress(" UNCHECKED_CAST" )
128- private fun compareComparableValues (
129- row1 : DataRow <* >,
130- row2 : DataRow <* >,
131- key : ColumnPath ,
132- desc : Boolean ,
133- ): Int {
134- val firstValue = row1.getValueOrNull(key) as Comparable <Any ?>?
135- val secondValue = row2.getValueOrNull(key) as Comparable <Any ?>?
136-
137- return when {
138- firstValue == null && secondValue == null -> 0
139- firstValue == null -> if (desc) 1 else - 1
140- secondValue == null -> if (desc) - 1 else 1
141- desc -> secondValue.compareTo(firstValue)
142- else -> firstValue.compareTo(secondValue)
/**
 * Builds a row comparator that orders rows by their cell in a single [column].
 *
 * The sort key depends on the kind of column:
 * - comparable values are compared directly, with nulls placed last;
 * - frame columns are compared by the row count of the nested dataframe;
 * - list columns are compared by list size, where a cell that is not a list counts as 0;
 * - anything else is compared by `toString()`, with a fallback literal for null cells.
 *
 * @param column The column whose cells provide the sort key.
 * @param desc When true, the ascending comparator is reversed as a whole.
 * @return A comparator over rows.
 */
private fun createColumnComparator(column: AnyCol, desc: Boolean): Comparator<DataRow<*>> {
    val ascending: Comparator<DataRow<*>> = when {
        column.valuesAreComparable() -> compareBy(nullsLast()) { row ->
            column[row] as Comparable<Any?>?
        }

        // Deliberately an explicit Comparator rather than compareBy:
        // it showed a slight performance improvement for this case.
        column.isFrameColumn() -> Comparator { left, right ->
            column[left].rowsCount().compareTo(column[right].rowsCount())
        }

        column.isList() -> compareBy { row -> (column[row] as? List<*>)?.size ?: 0 }

        // NOTE(review): the fallback literal is reproduced exactly as it appears in this view;
        // confirm whether an empty string was intended.
        else -> compareBy { row -> column[row]?.toString() ?: " " }
    }

    if (!desc) return ascending
    return ascending.reversed()
}
145187
146- private fun compareStringValues (
147- row1 : DataRow < * >,
148- row2 : DataRow < * >,
149- key : ColumnPath ,
150- desc : Boolean ,
151- ): Int {
152- val firstValue = (row1.getValueOrNull(key)?.toString() ? : " " )
153- val secondValue = (row2.getValueOrNull(key)?.toString() ? : " " )
154-
155- return if (desc) {
156- secondValue.compareTo(firstValue)
157- } else {
158- firstValue.compareTo(secondValue )
/**
 * Sorts the rows of this dataframe with [comparator] without building a sorted copy.
 *
 * Only the sorted order of the row indices is computed; it is wrapped together with the
 * receiver in a [SortedDataFrameView].
 *
 * @param comparator Ordering applied to the rows of the receiver.
 * @return A view over the receiver backed by the sorted index order.
 */
private fun <T> DataFrame<T>.sortWithLazy(comparator: Comparator<DataRow<T>>): DataFrame<T> =
    SortedDataFrameView(
        this,
        rows().sortedWith(comparator).map { row -> row.index() },
    )
192+
/**
 * A view over [source] whose row positions are remapped through [permutation].
 *
 * Position `i` of the view refers to row `permutation[i]` of [source]. Only the accessors
 * overridden below apply the permutation; all other [DataFrame] members come from the
 * `by source` delegation.
 */
private class SortedDataFrameView<T>(
    private val source: DataFrame<T>,
    private val permutation: List<Int>,
) : DataFrame<T> by source {

    /** Returns the source row that sits at position [index] of the permutation. */
    override operator fun get(index: Int): DataRow<T> {
        val sourceIndex = permutation[index]
        return source[sourceIndex]
    }

    /** Builds a dataframe from the source rows that the positions in [range] map to. */
    override operator fun get(range: IntRange): DataFrame<T> =
        source.getRows(range.map { position -> permutation[position] })

    /** Builds a dataframe from the source rows that the given positions map to. */
    override operator fun get(indices: Iterable<Int>): DataFrame<T> =
        source.getRows(indices.map { position -> permutation[position] })

    /** Looks the column up through the inherited `get` and indexes it with the whole permutation. */
    override fun get(columnName: String): AnyCol {
        val sourceColumn = super.get(columnName)
        return sourceColumn[permutation]
    }
}
161210
162211 internal fun isDataframeConvertable (dataframeLike : Any? ): Boolean =
0 commit comments