Skip to content

Commit bdd4d9d

Browse files
committed
feat: add aggregation methods (mode, std, variance)
1 parent 4ae4805 commit bdd4d9d

7 files changed

Lines changed: 510 additions & 0 deletions

File tree

src/methods/aggregation/mode.js

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
/**
 * Finds the most frequent value in a column.
 *
 * Entries that are null, undefined or NaN are ignored. If several values
 * share the highest frequency, the one that appears first in the column is
 * returned.
 *
 * Values are compared the way Map keys are, so the string '20' and the
 * number 20 are counted as two different values.
 *
 * @param {{ validateColumn(frame, column): void }} deps
 * @returns {(frame: TinyFrame, column: string) => any|null}
 *   The mode, or null when the column holds no valid values.
 */
export const mode =
  ({ validateColumn }) =>
  (frame, column) => {
    validateColumn(frame, column);

    const values = frame.columns[column];

    // Pass 1: count every valid value. A Map iterates in insertion order,
    // so its key order is the order of first appearance in the column.
    const counts = new Map();
    for (let i = 0; i < values.length; i++) {
      const value = values[i];

      // Skip NaN, null, and undefined values
      if (value === null || value === undefined || Number.isNaN(value)) {
        continue;
      }

      counts.set(value, (counts.get(value) || 0) + 1);
    }

    // Pass 2: pick the highest count. The strict ">" keeps the earliest
    // value on a tie, which is what "first one encountered" promises.
    // (Updating the winner while counting would instead return whichever
    // value reached the top count first, e.g. 20 for [10, 20, 20, 10].)
    let modeValue = null;
    let maxCount = 0;
    for (const [value, count] of counts) {
      if (count > maxCount) {
        maxCount = count;
        modeValue = value;
      }
    }

    // Still null when the column was empty or had no valid values
    return modeValue;
  };

src/methods/aggregation/std.js

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
/**
 * Calculates the standard deviation of values in a column.
 * By default, calculates the population standard deviation (divide by n).
 * Set the 'sample' option to true for the sample standard deviation
 * (divide by n - 1).
 *
 * Entries that are null, undefined, NaN, blank strings, or that do not
 * convert to a number are ignored. Numeric strings such as '20' are
 * converted and included.
 *
 * @param {{ validateColumn(frame, column): void }} deps
 * @returns {(frame: TinyFrame, column: string, options?: { sample?: boolean }) => number|null}
 *   The standard deviation, or null when there are no numeric values, or
 *   when sample=true and only one numeric value is present.
 */
export const std =
  ({ validateColumn }) =>
  (frame, column, options = {}) => {
    validateColumn(frame, column);

    const values = frame.columns[column];
    // The default parameter only covers undefined; guard an explicit null too
    const sample = Boolean(options && options.sample);

    // Filter out non-numeric values and convert to numbers
    const numericValues = [];
    for (let i = 0; i < values.length; i++) {
      const value = values[i];
      if (value === null || value === undefined) {
        continue;
      }

      // Number('') and Number('   ') are 0, which would silently add zeros
      // to the data set; a blank string is missing data, not a measurement.
      if (typeof value === 'string' && value.trim() === '') {
        continue;
      }

      const numValue = Number(value);
      if (!Number.isNaN(numValue)) {
        numericValues.push(numValue);
      }
    }

    const length = numericValues.length;
    if (length === 0) {
      return null;
    }

    // Calculate mean
    let sum = 0;
    for (let i = 0; i < length; i++) {
      sum += numericValues[i];
    }
    const mean = sum / length;

    // Calculate sum of squared differences from the mean
    let sumSquaredDiff = 0;
    for (let i = 0; i < length; i++) {
      const diff = numericValues[i] - mean;
      sumSquaredDiff += diff * diff;
    }

    // For population standard deviation, divide by n
    // For sample standard deviation, divide by (n-1)
    const divisor = sample ? length - 1 : length;

    // Handle edge case: if sample=true and there's only one value
    if (divisor === 0) {
      return null;
    }

    // Calculate standard deviation
    return Math.sqrt(sumSquaredDiff / divisor);
  };
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
/**
 * Calculates the variance of values in a column.
 * By default, calculates the population variance (divide by n).
 * Set the 'sample' option to true for the sample variance (divide by n - 1).
 *
 * Entries that are null, undefined, NaN, blank strings, or that do not
 * convert to a number are ignored. Numeric strings such as '20' are
 * converted and included.
 *
 * @param {{ validateColumn(frame, column): void }} deps
 * @returns {(frame: TinyFrame, column: string, options?: { sample?: boolean }) => number|null}
 *   The variance, or null when there are no numeric values, or when
 *   sample=true and only one numeric value is present.
 */
export const variance =
  ({ validateColumn }) =>
  (frame, column, options = {}) => {
    validateColumn(frame, column);

    const values = frame.columns[column];
    // The default parameter only covers undefined; guard an explicit null too
    const sample = Boolean(options && options.sample);

    // Filter out non-numeric values and convert to numbers
    const numericValues = [];
    for (let i = 0; i < values.length; i++) {
      const value = values[i];
      if (value === null || value === undefined) {
        continue;
      }

      // Number('') and Number('   ') are 0, which would silently add zeros
      // to the data set; a blank string is missing data, not a measurement.
      if (typeof value === 'string' && value.trim() === '') {
        continue;
      }

      const numValue = Number(value);
      if (!Number.isNaN(numValue)) {
        numericValues.push(numValue);
      }
    }

    const length = numericValues.length;
    if (length === 0) {
      return null;
    }

    // Calculate mean
    let sum = 0;
    for (let i = 0; i < length; i++) {
      sum += numericValues[i];
    }
    const mean = sum / length;

    // Calculate sum of squared differences from the mean
    let sumSquaredDiff = 0;
    for (let i = 0; i < length; i++) {
      const diff = numericValues[i] - mean;
      sumSquaredDiff += diff * diff;
    }

    // For population variance, divide by n
    // For sample variance, divide by (n-1)
    const divisor = sample ? length - 1 : length;

    // Handle edge case: if sample=true and there's only one value
    if (divisor === 0) {
      return null;
    }

    // Calculate variance
    return sumSquaredDiff / divisor;
  };

src/methods/raw.js

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,6 @@ export { min } from './aggregation/min.js';
1010
export { max } from './aggregation/max.js';
1111
export { last } from './aggregation/last.js';
1212
export { median } from './aggregation/median.js';
13+
export { mode } from './aggregation/mode.js';
14+
export { std } from './aggregation/std.js';
15+
export { variance } from './aggregation/variance.js';
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
import { describe, it, expect } from 'vitest';
2+
import { DataFrame } from '../../../src/core/DataFrame.js';
3+
import { mode } from '../../../src/methods/aggregation/mode.js';
4+
5+
// Unit tests for the `mode` aggregation. Each test builds the aggregator via
// mode({ validateColumn }) and calls it on the DataFrame's underlying frame
// (df._frame), so column validation is stubbed out unless a test supplies
// its own validator.
describe('mode method', () => {
  // Create test data
  const testData = [
    { value: 30, category: 'A', mixed: '20' },
    { value: 10, category: 'B', mixed: 30 },
    { value: 30, category: 'A', mixed: null },
    { value: 40, category: 'C', mixed: undefined },
    { value: 30, category: 'B', mixed: NaN },
    { value: 20, category: 'B', mixed: '20' },
  ];

  const df = DataFrame.create(testData);

  it('should find the most frequent value in a column', () => {
    // Call mode function directly
    const modeFn = mode({ validateColumn: () => {} });
    const result = modeFn(df._frame, 'value');

    // Check that the mode is correct
    expect(result).toBe(30); // 30 appears 3 times, more than any other value
  });

  it('should handle mixed data types by treating them as distinct', () => {
    // Call mode function directly
    const modeFn = mode({ validateColumn: () => {} });
    const result = modeFn(df._frame, 'mixed');

    // Check that the mode is correct (only valid values are considered)
    expect(result).toBe('20'); // '20' appears twice (string '20', not number 20)
  });

  it('should return null for a column with no valid values', () => {
    // Create data with only invalid values
    const invalidData = [
      { invalid: null },
      { invalid: undefined },
      { invalid: NaN },
    ];

    const invalidDf = DataFrame.create(invalidData);

    // Call mode function directly
    const modeFn = mode({ validateColumn: () => {} });
    const result = modeFn(invalidDf._frame, 'invalid');

    // Check that the result is null (no valid values)
    expect(result).toBe(null);
  });

  it('should return the first encountered value if multiple values have the same highest frequency', () => {
    // Create data with multiple modes
    const multiModeData = [
      { value: 10 },
      { value: 20 },
      { value: 10 },
      { value: 30 },
      { value: 20 },
      { value: 30 },
    ];

    const multiModeDf = DataFrame.create(multiModeData);

    // Call mode function directly
    const modeFn = mode({ validateColumn: () => {} });
    const result = modeFn(multiModeDf._frame, 'value');

    // All three values appear twice; 10 is the first one in the column, so
    // the documented tie-break must pick it. A membership check
    // ([10, 20, 30] contains result) would pass for any tie-break and would
    // not test what the title claims.
    expect(result).toBe(10);
  });

  it('should throw an error for non-existent column', () => {
    // Create a validator that throws an error for non-existent column
    const validateColumn = (frame, column) => {
      if (!(column in frame.columns)) {
        throw new Error(`Column '${column}' not found`);
      }
    };

    // Call mode function with validator
    const modeFn = mode({ validateColumn });

    // Check that it throws an error for non-existent column
    expect(() => modeFn(df._frame, 'nonexistent')).toThrow(
      "Column 'nonexistent' not found",
    );
  });

  it('should handle empty frames', () => {
    // Create an empty DataFrame
    const emptyDf = DataFrame.create([]);

    // Add an empty column
    emptyDf._frame.columns.value = [];

    // Call mode function directly
    const modeFn = mode({ validateColumn: () => {} });
    const result = modeFn(emptyDf._frame, 'value');

    // Check that the result is null for empty column
    expect(result).toBe(null);
  });
});

0 commit comments

Comments
 (0)