Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .semversioner/next-release/patch-20260213160631396575.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"type": "patch",
"description": "add csv table smoke tests"
}
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ async def read_dataframe(self, table_name: str) -> pd.DataFrame:
# Handle empty CSV (pandas can't parse files with no columns)
if not csv_data or csv_data.strip() == "":
return pd.DataFrame()
return pd.read_csv(StringIO(csv_data))
return pd.read_csv(StringIO(csv_data), keep_default_na=False)
except Exception:
logger.exception("error loading table from storage: %s", filename)
raise
Expand Down
21 changes: 12 additions & 9 deletions tests/fixtures/text/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,12 @@
300
],
"max_runtime": 30,
"nan_allowed_columns": [
"description"
],
"expected_artifacts": [
"entities.parquet",
"relationships.parquet"
"entities.csv",
"relationships.csv"
]
},
"create_communities": {
Expand All @@ -32,7 +35,7 @@
30
],
"max_runtime": 30,
"expected_artifacts": ["communities.parquet"]
"expected_artifacts": ["communities.csv"]
},
"create_community_reports_text": {
"row_range": [
Expand All @@ -51,7 +54,7 @@
"size"
],
"max_runtime": 2000,
"expected_artifacts": ["community_reports.parquet"]
"expected_artifacts": ["community_reports.csv"]
},
"create_final_text_units": {
"row_range": [
Expand All @@ -64,7 +67,7 @@
"covariate_ids"
],
"max_runtime": 30,
"expected_artifacts": ["text_units.parquet"]
"expected_artifacts": ["text_units.csv"]
},
"create_final_documents": {
"row_range": [
Expand All @@ -75,7 +78,7 @@
"raw_data"
],
"max_runtime": 30,
"expected_artifacts": ["documents.parquet"]
"expected_artifacts": ["documents.csv"]
},
"generate_text_embeddings": {
"row_range": [
Expand All @@ -84,9 +87,9 @@
],
"max_runtime": 150,
"expected_artifacts": [
"embeddings.text_unit_text.parquet",
"embeddings.entity_description.parquet",
"embeddings.community_full_content.parquet"
"embeddings.text_unit_text.csv",
"embeddings.entity_description.csv",
"embeddings.community_full_content.csv"
]
}
},
Expand Down
3 changes: 3 additions & 0 deletions tests/fixtures/text/settings.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ vector_store:
api_key: ${AZURE_AI_SEARCH_API_KEY}
container_name: "simple_text_ci"

table_provider:
type: csv

community_reports:
prompt: "prompts/community_report.txt"
max_length: 2000
Expand Down
38 changes: 21 additions & 17 deletions tests/smoke/test_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,25 +178,29 @@ def __assert_indexer_outputs(
for artifact in workflow_artifacts:
if artifact.endswith(".parquet"):
output_df = pd.read_parquet(output_path / artifact)

# Check number of rows between range
assert (
config["row_range"][0]
<= len(output_df)
<= config["row_range"][1]
), (
f"Expected between {config['row_range'][0]} and {config['row_range'][1]}, found: {len(output_df)} for file: {artifact}"
elif artifact.endswith(".csv"):
output_df = pd.read_csv(
output_path / artifact, keep_default_na=False
)
else:
continue

# Check number of rows between range
assert (
config["row_range"][0] <= len(output_df) <= config["row_range"][1]
), (
f"Expected between {config['row_range'][0]} and {config['row_range'][1]}, found: {len(output_df)} for file: {artifact}"
)

# Get non-nan rows
nan_df = output_df.loc[
:,
~output_df.columns.isin(config.get("nan_allowed_columns", [])),
]
nan_df = nan_df[nan_df.isna().any(axis=1)]
assert len(nan_df) == 0, (
f"Found {len(nan_df)} rows with NaN values for file: {artifact} on columns: {nan_df.columns[nan_df.isna().any()].tolist()}"
)
# Get non-nan rows
nan_df = output_df.loc[
:,
~output_df.columns.isin(config.get("nan_allowed_columns", [])),
]
nan_df = nan_df[nan_df.isna().any(axis=1)]
assert len(nan_df) == 0, (
f"Found {len(nan_df)} rows with NaN values for file: {artifact} on columns: {nan_df.columns[nan_df.isna().any()].tolist()}"
)

def __run_query(self, root: Path, query_config: dict[str, str]):
command = [
Expand Down