Skip to content

Commit fbd17a6

Browse files
committed
Remove Preference support from constants; Add unit tests for PREFERENCE_OPENAI
1 parent 5320437 commit fbd17a6

File tree

2 files changed

+181
-31
lines changed

2 files changed

+181
-31
lines changed

src/together/constants.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,15 +39,13 @@ class DatasetFormat(enum.Enum):
3939
GENERAL = "general"
4040
CONVERSATION = "conversation"
4141
INSTRUCTION = "instruction"
42-
PREFERENCE = "preference"
4342
PREFERENCE_OPENAI = "preference_openai"
4443

4544

4645
JSONL_REQUIRED_COLUMNS_MAP = {
4746
DatasetFormat.GENERAL: ["text"],
4847
DatasetFormat.CONVERSATION: ["messages"],
4948
DatasetFormat.INSTRUCTION: ["prompt", "completion"],
50-
DatasetFormat.PREFERENCE: ["chosen", "rejected"],
5149
DatasetFormat.PREFERENCE_OPENAI: [
5250
"input",
5351
"preferred_output",

tests/unit/test_files_checks.py

Lines changed: 181 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,54 @@
55
from together.constants import MIN_SAMPLES
66
from together.utils.files import check_file
77

8+
_TEST_PREFERENCE_OPENAI_CONTENT = [
9+
{
10+
"input": {
11+
"messages": [
12+
{"role": "user", "content": "Hi there, I have a question."},
13+
{"role": "assistant", "content": "Hello, how is your day going?"},
14+
{
15+
"role": "user",
16+
"content": "Hello, can you tell me how cold San Francisco is today?",
17+
},
18+
],
19+
},
20+
"preferred_output": [
21+
{
22+
"role": "assistant",
23+
"content": "Today in San Francisco, it is not quite cold as expected. Morning clouds will give away "
24+
"to sunshine, with a high near 68°F (20°C) and a low around 57°F (14°C).",
25+
}
26+
],
27+
"non_preferred_output": [
28+
{
29+
"role": "assistant",
30+
"content": "It is not particularly cold in San Francisco today.",
31+
}
32+
],
33+
},
34+
{
35+
"input": {
36+
"messages": [
37+
{
38+
"role": "user",
39+
"content": "What's the best way to learn programming?",
40+
},
41+
],
42+
},
43+
"preferred_output": [
44+
{
45+
"role": "assistant",
46+
"content": "The best way to learn programming is through consistent practice, working on real projects, "
47+
"and breaking down complex problems into smaller parts. Start with a beginner-friendly language like Python.",
48+
}
49+
],
50+
"non_preferred_output": [
51+
{"role": "assistant", "content": "Just read some books and you'll be fine."}
52+
],
53+
},
54+
]
55+
856

957
def test_check_jsonl_valid_general(tmp_path: Path):
1058
# Create a valid JSONL file
@@ -80,45 +128,149 @@ def test_check_jsonl_valid_conversational_single_turn(tmp_path: Path):
80128
def test_check_jsonl_valid_conversational_multiple_turns(tmp_path: Path):
81129
# Create a valid JSONL file with conversational format and multiple user-assistant turn pairs
82130
file = tmp_path / "valid_conversational_multiple_turns.jsonl"
83-
content = [
131+
content = _TEST_PREFERENCE_OPENAI_CONTENT
132+
with file.open("w") as f:
133+
f.write("\n".join(json.dumps(item) for item in content))
134+
135+
report = check_file(file)
136+
137+
assert report["is_check_passed"]
138+
assert report["utf8"]
139+
assert report["num_samples"] == len(content)
140+
assert report["has_min_samples"]
141+
142+
143+
def test_check_jsonl_valid_preference_openai(tmp_path: Path):
144+
file = tmp_path / "valid_preference_openai.jsonl"
145+
content = _TEST_PREFERENCE_OPENAI_CONTENT
146+
with file.open("w") as f:
147+
f.write("\n".join(json.dumps(item) for item in content))
148+
149+
report = check_file(file)
150+
151+
assert report["is_check_passed"]
152+
assert report["utf8"]
153+
assert report["num_samples"] == len(content)
154+
assert report["has_min_samples"]
155+
156+
157+
def test_check_jsonl_invalid_preference_openai_missing_fields(tmp_path: Path):
158+
# Test all required fields in OpenAI preference format
159+
required_fields = [
160+
("input", "Missing input field"),
161+
("preferred_output", "Missing preferred_output field"),
162+
("non_preferred_output", "Missing non_preferred_output field"),
163+
]
164+
165+
for field_to_remove, description in required_fields:
166+
file = tmp_path / f"invalid_preference_openai_missing_{field_to_remove}.jsonl"
167+
content = [item.copy() for item in _TEST_PREFERENCE_OPENAI_CONTENT]
168+
169+
# Remove the specified field from the first item
170+
del content[0][field_to_remove]
171+
172+
with file.open("w") as f:
173+
f.write("\n".join(json.dumps(item) for item in content))
174+
175+
report = check_file(file)
176+
177+
assert not report["is_check_passed"], f"Test should fail when {description}"
178+
179+
180+
def test_check_jsonl_invalid_preference_openai_structural_issues(tmp_path: Path):
181+
# Test various structural issues in OpenAI preference format
182+
test_cases = [
84183
{
85-
"messages": [
86-
{"role": "user", "content": "Is it going to rain today?"},
184+
"name": "empty_messages",
185+
"modifier": lambda item: item.update({"input": {"messages": []}}),
186+
"description": "Empty messages array",
187+
},
188+
{
189+
"name": "missing_role_preferred",
190+
"modifier": lambda item: item.update(
191+
{"preferred_output": [{"content": "Missing role field"}]}
192+
),
193+
"description": "Missing role in preferred_output",
194+
},
195+
{
196+
"name": "missing_role_non_preferred",
197+
"modifier": lambda item: item.update(
198+
{"non_preferred_output": [{"content": "Missing role field"}]}
199+
),
200+
"description": "Missing role in non_preferred_output",
201+
},
202+
{
203+
"name": "wrong_output_format_preferred",
204+
"modifier": lambda item: item.update(
205+
{"preferred_output": "Not an array but a string"}
206+
),
207+
"description": "Wrong format for preferred_output",
208+
},
209+
{
210+
"name": "wrong_output_format_non_preferred",
211+
"modifier": lambda item: item.update(
212+
{"non_preferred_output": "Not an array but a string"}
213+
),
214+
"description": "Wrong format for non_preferred_output",
215+
},
216+
{
217+
"name": "missing_content",
218+
"modifier": lambda item: item.update(
219+
{"input": {"messages": [{"role": "user"}]}}
220+
),
221+
"description": "Missing content in messages",
222+
},
223+
{
224+
"name": "multiple_preferred_outputs",
225+
"modifier": lambda item: item.update(
87226
{
88-
"role": "assistant",
89-
"content": "Yes, expect showers in the afternoon.",
90-
},
91-
{"role": "user", "content": "What is the weather like in Tokyo?"},
92-
{"role": "assistant", "content": "It is sunny with a chance of rain."},
93-
]
227+
"preferred_output": [
228+
{"role": "assistant", "content": "First response"},
229+
{"role": "assistant", "content": "Second response"},
230+
]
231+
}
232+
),
233+
"description": "Multiple messages in preferred_output",
94234
},
95235
{
96-
"messages": [
97-
{"role": "user", "content": "Who won the game last night?"},
98-
{"role": "assistant", "content": "The home team won by two points."},
99-
{"role": "user", "content": "What is the weather like in Amsterdam?"},
100-
{"role": "assistant", "content": "It is cloudy with a chance of snow."},
101-
]
236+
"name": "multiple_non_preferred_outputs",
237+
"modifier": lambda item: item.update(
238+
{
239+
"non_preferred_output": [
240+
{"role": "assistant", "content": "First response"},
241+
{"role": "assistant", "content": "Second response"},
242+
]
243+
}
244+
),
245+
"description": "Multiple messages in non_preferred_output",
102246
},
103247
{
104-
"messages": [
105-
{"role": "system", "content": "You are a kind AI"},
106-
{"role": "user", "content": "Who won the game last night?"},
107-
{"role": "assistant", "content": "The home team won by two points."},
108-
{"role": "user", "content": "What is the weather like in Amsterdam?"},
109-
{"role": "assistant", "content": "It is cloudy with a chance of snow."},
110-
]
248+
"name": "empty_preferred_output",
249+
"modifier": lambda item: item.update({"preferred_output": []}),
250+
"description": "Empty preferred_output array",
251+
},
252+
{
253+
"name": "empty_non_preferred_output",
254+
"modifier": lambda item: item.update({"non_preferred_output": []}),
255+
"description": "Empty non_preferred_output array",
111256
},
112257
]
113-
with file.open("w") as f:
114-
f.write("\n".join(json.dumps(item) for item in content))
115258

116-
report = check_file(file)
259+
for test_case in test_cases:
260+
file = tmp_path / f"invalid_preference_openai_{test_case['name']}.jsonl"
261+
content = [item.copy() for item in _TEST_PREFERENCE_OPENAI_CONTENT]
117262

118-
assert report["is_check_passed"]
119-
assert report["utf8"]
120-
assert report["num_samples"] == len(content)
121-
assert report["has_min_samples"]
263+
# Apply the modification to the first item
264+
test_case["modifier"](content[0])
265+
266+
with file.open("w") as f:
267+
f.write("\n".join(json.dumps(item) for item in content))
268+
269+
report = check_file(file)
270+
271+
assert not report[
272+
"is_check_passed"
273+
], f"Test should fail with {test_case['description']}"
122274

123275

124276
def test_check_jsonl_empty_file(tmp_path: Path):

0 commit comments

Comments
 (0)