-
Notifications
You must be signed in to change notification settings - Fork 77
Fast sorting of columns/table in Kotlin Notebook #1639
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
17f5a59
1e1f0c0
d73a447
d14eaac
c2f96cd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -3,6 +3,7 @@ package org.jetbrains.kotlinx.dataframe.jupyter | |||||||||||||||
| import org.jetbrains.kotlinx.dataframe.AnyCol | ||||||||||||||||
| import org.jetbrains.kotlinx.dataframe.AnyFrame | ||||||||||||||||
| import org.jetbrains.kotlinx.dataframe.AnyRow | ||||||||||||||||
| import org.jetbrains.kotlinx.dataframe.DataFrame | ||||||||||||||||
| import org.jetbrains.kotlinx.dataframe.DataRow | ||||||||||||||||
| import org.jetbrains.kotlinx.dataframe.annotations.RequiredByIntellijPlugin | ||||||||||||||||
| import org.jetbrains.kotlinx.dataframe.api.Convert | ||||||||||||||||
|
|
@@ -29,13 +30,17 @@ import org.jetbrains.kotlinx.dataframe.api.at | |||||||||||||||
| import org.jetbrains.kotlinx.dataframe.api.dataFrameOf | ||||||||||||||||
| import org.jetbrains.kotlinx.dataframe.api.frames | ||||||||||||||||
| import org.jetbrains.kotlinx.dataframe.api.getColumn | ||||||||||||||||
| import org.jetbrains.kotlinx.dataframe.api.getRows | ||||||||||||||||
| import org.jetbrains.kotlinx.dataframe.api.into | ||||||||||||||||
| import org.jetbrains.kotlinx.dataframe.api.sortWith | ||||||||||||||||
| import org.jetbrains.kotlinx.dataframe.api.isFrameColumn | ||||||||||||||||
| import org.jetbrains.kotlinx.dataframe.api.isList | ||||||||||||||||
| import org.jetbrains.kotlinx.dataframe.api.rows | ||||||||||||||||
| import org.jetbrains.kotlinx.dataframe.api.toDataFrame | ||||||||||||||||
| import org.jetbrains.kotlinx.dataframe.api.values | ||||||||||||||||
| import org.jetbrains.kotlinx.dataframe.api.valuesAreComparable | ||||||||||||||||
| import org.jetbrains.kotlinx.dataframe.columns.ColumnPath | ||||||||||||||||
| import org.jetbrains.kotlinx.dataframe.impl.ColumnNameGenerator | ||||||||||||||||
| import java.util.Arrays | ||||||||||||||||
|
|
||||||||||||||||
| /** | ||||||||||||||||
| * A class with utility methods for Kotlin Notebook Plugin integration. | ||||||||||||||||
|
|
@@ -68,6 +73,10 @@ public object KotlinNotebookPluginUtils { | |||||||||||||||
| /** | ||||||||||||||||
| * Sorts a dataframe-like object by multiple columns. | ||||||||||||||||
| * If a column type is not comparable, sorting by string representation is applied instead. | ||||||||||||||||
| * Frame columns and list columns are sorted by their size (row count / element count), because looking at the smallest / biggest groups after groupBy is a very common use case. | ||||||||||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. and lists |
||||||||||||||||
| * | ||||||||||||||||
| * Returns a "lazily materialized" dataframe, which means that a get, getRows, or take operation must be applied to turn it into a valid sorted dataframe. | ||||||||||||||||
| * "Lazily materialized" means that, e.g., after sorting 1 million rows with a page size of 100, a dataframe with only 100 rows is created. | ||||||||||||||||
| * | ||||||||||||||||
| * @param dataFrameLike The dataframe-like object to sort. | ||||||||||||||||
| * @param columnPaths The list of columns to sort by. Each element in the list represents a column path | ||||||||||||||||
|
|
@@ -103,60 +112,100 @@ public object KotlinNotebookPluginUtils { | |||||||||||||||
| ColumnPath(path) | ||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
| val comparator = createComparator(sortKeys, isDesc) | ||||||||||||||||
| if (sortKeys.size == 1) { | ||||||||||||||||
| val column = df.getColumn(sortKeys[0]) | ||||||||||||||||
|
|
||||||||||||||||
| // Not sure how to have generic logic that would produce Comparator<Int> and Comparator<DataRow> without overhead | ||||||||||||||||
| // For now Comparator<DataRow> is needed for fallback case of sorting multiple columns. Although it's now impossible in UI | ||||||||||||||||
koperagen marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||||||
| // Please make sure to change both this and createColumnComparator | ||||||||||||||||
|
Comment on lines
+118
to
+120
|
||||||||||||||||
| // Not sure how to have generic logic that would produce Comparator<Int> and Comparator<DataRow> without overhead | |
| // For now Comparator<DataRow> is needed for fallback case of sorting multiple columns. Although it's now impossible in UI | |
| // Please make sure to change both this and createColumnComparator | |
| // Not sure how to have generic logic that would produce Comparator<Int> and Comparator<DataRow> without overhead. | |
| // For now Comparator<DataRow> is needed as a fallback for multi-column sorting. Although multi-column sorting | |
| // is currently disabled in the Kotlin Notebook Plugin UI, this fallback is retained for API compatibility and | |
| // potential future re-enablement. Please make sure to change both this and createColumnComparator. |
koperagen marked this conversation as resolved.
Show resolved
Hide resolved
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@copilot is it thread-safe to use parallelSort here?
koperagen marked this conversation as resolved.
Show resolved
Hide resolved
koperagen marked this conversation as resolved.
Show resolved
Hide resolved
koperagen marked this conversation as resolved.
Show resolved
Hide resolved
koperagen marked this conversation as resolved.
Show resolved
Hide resolved
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,84 @@ | ||
| package org.jetbrains.kotlinx.dataframe | ||
|
|
||
| import io.kotest.matchers.shouldBe | ||
| import org.jetbrains.kotlinx.dataframe.api.dataFrameOf | ||
| import org.jetbrains.kotlinx.dataframe.api.toColumn | ||
| import org.jetbrains.kotlinx.dataframe.jupyter.KotlinNotebookPluginUtils | ||
| import org.junit.Test | ||
| import kotlin.random.Random | ||
|
|
||
| /** | ||
| * Other tests are located in the Jupyter module: | ||
| * org.jetbrains.kotlinx.dataframe.jupyter.RenderingTests | ||
| */ | ||
| class KotlinNotebookPluginUtilsTests { | ||
| @Test | ||
| fun `sort lists by size descending`() { | ||
| val random = Random(123) | ||
| val lists = List(20) { List(random.nextInt(1, 100)) { it } } + null | ||
| val df = dataFrameOf("listColumn" to lists) | ||
|
|
||
| val res = KotlinNotebookPluginUtils.sortByColumns(df, listOf(listOf("listColumn")), desc = listOf(true)) | ||
|
|
||
| res["listColumn"].values() shouldBe lists.sortedByDescending { it?.size ?: 0 } | ||
| } | ||
|
Comment on lines
+16
to
+24
|
||
|
|
||
| @Test | ||
| fun `sort lists by size ascending`() { | ||
| val lists = listOf(listOf(1, 2, 3), listOf(1), listOf(1, 2), null) | ||
| val df = dataFrameOf("listColumn" to lists) | ||
|
|
||
| val res = KotlinNotebookPluginUtils.sortByColumns(df, listOf(listOf("listColumn")), desc = listOf(false)) | ||
|
|
||
| res["listColumn"].values() shouldBe listOf(null, listOf(1), listOf(1, 2), listOf(1, 2, 3)) | ||
| } | ||
|
|
||
| @Test | ||
| fun `sort empty lists`() { | ||
| val lists = listOf(listOf(1, 2), emptyList(), listOf(1), emptyList()) | ||
| val df = dataFrameOf("listColumn" to lists) | ||
|
|
||
| val res = KotlinNotebookPluginUtils.sortByColumns(df, listOf(listOf("listColumn")), desc = listOf(true)) | ||
|
|
||
| res["listColumn"].values() shouldBe listOf(listOf(1, 2), listOf(1), emptyList(), emptyList()) | ||
| } | ||
|
Comment on lines
+37
to
+44
|
||
|
|
||
| @Test | ||
| fun `sort lists with equal sizes preserves stability`() { | ||
| val lists = listOf(listOf("a"), listOf("b"), listOf("c")) | ||
| val df = dataFrameOf("listColumn" to lists) | ||
|
|
||
| val res = KotlinNotebookPluginUtils.sortByColumns(df, listOf(listOf("listColumn")), desc = listOf(true)) | ||
|
|
||
| res["listColumn"].values() shouldBe lists | ||
| } | ||
|
|
||
| @Test | ||
| fun `sort frame column by row count descending`() { | ||
| val frames = listOf( | ||
| dataFrameOf("x" to listOf(1)), | ||
| dataFrameOf("x" to listOf(1, 2, 3)), | ||
| dataFrameOf("x" to listOf(1, 2)), | ||
| DataFrame.empty(), | ||
| ) | ||
| val df = dataFrameOf("nested" to frames.toColumn()) | ||
|
|
||
| val res = KotlinNotebookPluginUtils.sortByColumns(df, listOf(listOf("nested")), desc = listOf(true)) | ||
|
|
||
| res["nested"].values().map { (it as DataFrame<*>).rowsCount() } shouldBe listOf(3, 2, 1, 0) | ||
| } | ||
|
|
||
| @Test | ||
| fun `sort frame column by row count ascending`() { | ||
| val frames = listOf( | ||
| dataFrameOf("x" to listOf(1, 2, 3)), | ||
| dataFrameOf("x" to listOf(1)), | ||
| DataFrame.empty(), | ||
| ) | ||
| val df = dataFrameOf("nested" to frames.toColumn()) | ||
|
|
||
| val res = KotlinNotebookPluginUtils.sortByColumns(df, listOf(listOf("nested")), desc = listOf(false)) | ||
|
|
||
| res["nested"].values().map { (it as DataFrame<*>).rowsCount() } shouldBe listOf(0, 1, 3) | ||
| } | ||
| } | ||
koperagen marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,59 @@ | ||
| @file:Suppress("EmptyRange") | ||
|
|
||
| package org.jetbrains.kotlinx.dataframe | ||
|
|
||
| import kotlinx.benchmark.Benchmark | ||
| import kotlinx.benchmark.BenchmarkMode | ||
| import kotlinx.benchmark.BenchmarkTimeUnit | ||
| import kotlinx.benchmark.Measurement | ||
| import kotlinx.benchmark.Mode | ||
| import kotlinx.benchmark.OutputTimeUnit | ||
| import kotlinx.benchmark.Param | ||
| import kotlinx.benchmark.Scope | ||
| import kotlinx.benchmark.Setup | ||
| import kotlinx.benchmark.State | ||
| import kotlinx.benchmark.Warmup | ||
| import org.jetbrains.kotlinx.dataframe.api.dataFrameOf | ||
| import org.jetbrains.kotlinx.dataframe.api.toColumn | ||
| import org.jetbrains.kotlinx.dataframe.api.toDataFrame | ||
| import org.jetbrains.kotlinx.dataframe.jupyter.KotlinNotebookPluginUtils | ||
| import kotlin.random.Random | ||
|
|
||
| @State(Scope.Benchmark) | ||
| @Warmup(iterations = 5, time = 1) | ||
| @Measurement(iterations = 5, time = 1) | ||
| @BenchmarkMode(Mode.AverageTime) | ||
| @OutputTimeUnit(BenchmarkTimeUnit.MILLISECONDS) | ||
| open class SortingBenchmark { | ||
|
|
||
| @Param("10000", "100000", "1000000") | ||
| var size: Int = 0 | ||
|
|
||
| @Param("int", "string", "double", "category", "list", "frame") | ||
| lateinit var columnType: String | ||
|
|
||
| private lateinit var df: DataFrame<*> | ||
| private lateinit var columnPath: List<String> | ||
|
|
||
| @Setup | ||
| fun setup() { | ||
| val random = Random(42) | ||
| df = (0 until size).toDataFrame { | ||
| "int" from { it } | ||
| "string" from { "name_${random.nextInt(1000)}" } | ||
| "double" from { random.nextDouble() } | ||
| "category" from { listOf("A", "B", "C", "D").random(random) } | ||
| "list" from { List(random.nextInt(1, 20)) { "tag$it" } } | ||
| "frame" from { | ||
| dataFrameOf("x" to List(random.nextInt(1, 50)) { random.nextInt() }.toColumn()) | ||
| } | ||
| } | ||
| columnPath = listOf(columnType) | ||
| } | ||
|
|
||
| @Benchmark | ||
| fun sort(): DataFrame<*> { | ||
| val sorted = KotlinNotebookPluginUtils.sortByColumns(df, listOf(columnPath), listOf(false)) | ||
| return KotlinNotebookPluginUtils.getRowsSubsetForRendering(sorted, 0, 20).value | ||
| } | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Happy to see the first benchmark in our project!