Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,15 @@
import datafusion as dfn
import numpy as np
import pytest
from datafusion import col, lit
from datafusion import functions as F


@pytest.fixture(autouse=True)
def _doctest_namespace(doctest_namespace: dict) -> None:
"""Add common imports to the doctest namespace."""
doctest_namespace["dfn"] = dfn
doctest_namespace["np"] = np
doctest_namespace["col"] = col
doctest_namespace["lit"] = lit
doctest_namespace["F"] = F
3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,6 @@ ignore = [
"FIX002", # Allow TODO lines - consider removing at some point
"ISC001", # Recommended to ignore these rules when using with ruff-format
"N812", # Allow importing functions as `F`
"PD901", # Allow variable name df
"PLR0913", # Allow many arguments in function definition
"SLF001", # Allow accessing private members
"TD002", # Do not require author names in TODO statements
Expand Down Expand Up @@ -190,7 +189,7 @@ dev = [
"pytest-asyncio>=0.23.3",
"pytest>=7.4.4",
"pyyaml>=6.0.3",
"ruff>=0.9.1",
"ruff>=0.15.1",
"toml>=0.10.2",
]
docs = [
Expand Down
30 changes: 12 additions & 18 deletions python/datafusion/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,9 +371,8 @@ def with_fair_spill_pool(self, size: int) -> RuntimeEnvBuilder:
Returns:
A new :py:class:`RuntimeEnvBuilder` object with the updated setting.

Examples usage::

config = RuntimeEnvBuilder().with_fair_spill_pool(1024)
Examples:
>>> config = dfn.RuntimeEnvBuilder().with_fair_spill_pool(1024)
"""
self.config_internal = self.config_internal.with_fair_spill_pool(size)
return self
Expand All @@ -391,9 +390,8 @@ def with_greedy_memory_pool(self, size: int) -> RuntimeEnvBuilder:
Returns:
A new :py:class:`RuntimeEnvBuilder` object with the updated setting.

Example usage::

config = RuntimeEnvBuilder().with_greedy_memory_pool(1024)
Examples:
>>> config = dfn.RuntimeEnvBuilder().with_greedy_memory_pool(1024)
"""
self.config_internal = self.config_internal.with_greedy_memory_pool(size)
return self
Expand All @@ -407,9 +405,8 @@ def with_temp_file_path(self, path: str | pathlib.Path) -> RuntimeEnvBuilder:
Returns:
A new :py:class:`RuntimeEnvBuilder` object with the updated setting.

Example usage::

config = RuntimeEnvBuilder().with_temp_file_path("/tmp")
Examples:
>>> config = dfn.RuntimeEnvBuilder().with_temp_file_path("/tmp")
"""
self.config_internal = self.config_internal.with_temp_file_path(str(path))
return self
Expand Down Expand Up @@ -444,9 +441,8 @@ def with_allow_ddl(self, allow: bool = True) -> SQLOptions:
Returns:
A new :py:class:`SQLOptions` object with the updated setting.

Example usage::

options = SQLOptions().with_allow_ddl(True)
Examples:
>>> options = dfn.SQLOptions().with_allow_ddl(True)
"""
self.options_internal = self.options_internal.with_allow_ddl(allow)
return self
Expand All @@ -462,9 +458,8 @@ def with_allow_dml(self, allow: bool = True) -> SQLOptions:
Returns:
A new :py:class:`SQLOptions` object with the updated setting.

Example usage::

options = SQLOptions().with_allow_dml(True)
Examples:
>>> options = dfn.SQLOptions().with_allow_dml(True)
"""
self.options_internal = self.options_internal.with_allow_dml(allow)
return self
Expand All @@ -478,9 +473,8 @@ def with_allow_statements(self, allow: bool = True) -> SQLOptions:
Returns:
A new :py:class:SQLOptions` object with the updated setting.

Example usage::

options = SQLOptions().with_allow_statements(True)
Examples:
>>> options = dfn.SQLOptions().with_allow_statements(True)
"""
self.options_internal = self.options_internal.with_allow_statements(allow)
return self
Expand Down
84 changes: 51 additions & 33 deletions python/datafusion/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@ def into_view(self, temporary: bool = False) -> Table:
>>> result[0].column("value").to_pylist()
[1]
"""
from datafusion.catalog import Table as _Table
from datafusion.catalog import Table as _Table # noqa: PLC0415

return _Table(self.df.into_view(temporary))

Expand Down Expand Up @@ -451,9 +451,20 @@ def drop(self, *columns: str) -> DataFrame:
Returns:
DataFrame with those columns removed in the projection.

Example Usage::
df.drop('a') # To drop a lower-cased column 'a'
df.drop('"a"') # To drop an upper-cased column 'A'
Examples:
To drop a lower-cased column 'a'

>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1, 2], "b": [3, 4]})
>>> df.drop("a").schema().names
['b']

Or to drop an upper-cased column 'A'

>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"A": [1, 2], "b": [3, 4]})
>>> df.drop('"A"').schema().names
['b']
"""
return DataFrame(self.df.drop(*columns))

Expand All @@ -468,11 +479,13 @@ def filter(self, *predicates: Expr | str) -> DataFrame:
that will be parsed against the DataFrame schema. If more complex logic is
required, see the logical operations in :py:mod:`~datafusion.functions`.

Example::

from datafusion import col, lit
df.filter(col("a") > lit(1))
df.filter("a > 1")
Examples:
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1, 2, 3]})
>>> df.filter(col("a") > lit(1)).to_pydict()
{'a': [2, 3]}
>>> df.filter("a > 1").to_pydict()
{'a': [2, 3]}

Args:
predicates: Predicate expression(s) or SQL strings to filter the DataFrame.
Expand All @@ -495,14 +508,12 @@ def parse_sql_expr(self, expr: str) -> Expr:

The expression is created and processed against the current schema.

Example::

from datafusion import col, lit
df.parse_sql_expr("a > 1")

should produce:

col("a") > lit(1)
Examples:
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1, 2, 3]})
>>> expr = df.parse_sql_expr("a > 1")
>>> df.filter(expr).to_pydict()
{'a': [2, 3]}

Args:
expr: Expression string to be converted to datafusion expression
Expand All @@ -519,10 +530,11 @@ def with_column(self, name: str, expr: Expr | str) -> DataFrame:
:func:`datafusion.col` or :func:`datafusion.lit`, or a SQL expression
string that will be parsed against the DataFrame schema.

Example::

from datafusion import col, lit
df.with_column("b", col("a") + lit(1))
Examples:
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1, 2]})
>>> df.with_column("b", col("a") + lit(10)).to_pydict()
{'a': [1, 2], 'b': [11, 12]}

Args:
name: Name of the column to add.
Expand Down Expand Up @@ -885,10 +897,14 @@ def join_on(
built with :func:`datafusion.col`. On expressions are used to support
in-equality predicates. Equality predicates are correctly optimized.

Example::

from datafusion import col
df.join_on(other_df, col("id") == col("other_id"))
Examples:
>>> ctx = dfn.SessionContext()
>>> left = ctx.from_pydict({"a": [1, 2], "x": ["a", "b"]})
>>> right = ctx.from_pydict({"b": [1, 2], "y": ["c", "d"]})
>>> left.join_on(
... right, col("a") == col("b")
... ).sort(col("x")).to_pydict()
{'a': [1, 2], 'x': ['a', 'b'], 'b': [1, 2], 'y': ['c', 'd']}

Args:
right: Other DataFrame to join with.
Expand Down Expand Up @@ -1350,15 +1366,17 @@ def __aiter__(self) -> AsyncIterator[RecordBatch]:
def transform(self, func: Callable[..., DataFrame], *args: Any) -> DataFrame:
"""Apply a function to the current DataFrame which returns another DataFrame.

This is useful for chaining together multiple functions. For example::

def add_3(df: DataFrame) -> DataFrame:
return df.with_column("modified", lit(3))
This is useful for chaining together multiple functions.

def within_limit(df: DataFrame, limit: int) -> DataFrame:
return df.filter(col("a") < lit(limit)).distinct()

df = df.transform(modify_df).transform(within_limit, 4)
Examples:
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1, 2, 3]})
>>> def add_3(df):
... return df.with_column("modified", dfn.lit(3))
>>> def within_limit(df: DataFrame, limit: int) -> DataFrame:
... return df.filter(col("a") < lit(limit)).distinct()
>>> df.transform(add_3).transform(within_limit, 4).sort("a").to_pydict()
{'a': [1, 2, 3], 'modified': [3, 3, 3]}

Args:
func: A callable function that takes a DataFrame as it's first argument
Expand Down
24 changes: 13 additions & 11 deletions python/datafusion/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,7 @@ def sort_list_to_raw_sort_list(
return raw_sort_list


class Expr:
class Expr: # noqa: PLW1641
"""Expression object.

Expressions are one of the core concepts in DataFusion. See
Expand Down Expand Up @@ -1367,16 +1367,18 @@ def is_unbounded(self) -> bool:
class CaseBuilder:
"""Builder class for constructing case statements.

An example usage would be as follows::

import datafusion.functions as f
from datafusion import lit, col
df.select(
f.case(col("column_a"))
.when(lit(1), lit("One"))
.when(lit(2), lit("Two"))
.otherwise(lit("Unknown"))
)
Examples:
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1, 2, 3]})
>>> result = df.select(
... dfn.functions.case(dfn.col("a"))
... .when(dfn.lit(1), dfn.lit("One"))
... .when(dfn.lit(2), dfn.lit("Two"))
... .otherwise(dfn.lit("Other"))
... .alias("label")
... )
>>> result.to_pydict()
{'label': ['One', 'Two', 'Other']}
"""

def __init__(self, case_builder: expr_internal.CaseBuilder) -> None:
Expand Down
4 changes: 2 additions & 2 deletions python/datafusion/input/location.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def build_table(
num_rows = 0 # Total number of rows in the file. Used for statistics
columns = []
if file_format == "parquet":
import pyarrow.parquet as pq
import pyarrow.parquet as pq # noqa: PLC0415

# Read the Parquet metadata
metadata = pq.read_metadata(input_item)
Expand All @@ -61,7 +61,7 @@ def build_table(
]

elif format == "csv":
import csv
import csv # noqa: PLC0415

# Consume header row and count number of rows for statistics.
# TODO: Possibly makes sense to have the eager number of rows
Expand Down
2 changes: 1 addition & 1 deletion python/datafusion/plan.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
]


class LogicalPlan:
class LogicalPlan: # noqa: PLW1641
"""Logical Plan.

A `LogicalPlan` is a node in a tree of relational operators (such as
Expand Down
Loading
Loading