# 318 lines | 14 KiB | Python
import unittest
|
|
|
|
from open_r1.rewards import (
|
|
accuracy_reward,
|
|
format_reward,
|
|
get_cosine_scaled_reward,
|
|
get_repetition_penalty_reward,
|
|
len_reward,
|
|
reasoning_steps_reward,
|
|
)
|
|
|
|
|
|
class TestRewards(unittest.TestCase):
    """Tests for the accuracy, format, reasoning-steps, cosine-scaled and length rewards.

    Every reward function is called with a batch of completions, where each
    completion is a list holding one ``{"content": ...}`` message, and is
    expected to return one reward per completion.
    """

    def test_accuracy_reward_correct_answer(self):
        """Test accuracy_reward with a correct answer."""
        completion = [[{"content": r"\boxed{\frac{63}{400}}"}]]
        solution = [r"\frac{63}{400}"]

        rewards = accuracy_reward(completion, solution)
        self.assertEqual(rewards[0], 1.0)

    def test_accuracy_reward_wrong_answer(self):
        """Test accuracy_reward with an incorrect answer."""
        completion = [[{"content": r"\boxed{\frac{64}{400}}"}]]
        solution = [r"\frac{63}{400}"]

        rewards = accuracy_reward(completion, solution)
        self.assertEqual(rewards[0], 0.0)

    def test_format_reward_correct(self):
        """Test format_reward with correct format."""
        completion = [[{"content": "<think>Some reasoning</think><answer>The answer</answer>"}]]
        rewards = format_reward(completion)
        self.assertEqual(rewards[0], 1.0)

    def test_format_reward_incorrect(self):
        """Test format_reward with incorrect format."""
        incorrect_formats = [
            "<think>Only thinking</think>",
            "<answer>Only answer</answer>",
            "No tags at all",
            "<think>Missing closing</think><answer>Missing closing",
            "<think>Wrong order</answer><answer>Wrong order</think>",
        ]

        for fmt in incorrect_formats:
            # subTest keeps the remaining inputs running after a failure and
            # names the offending input in the report.
            with self.subTest(content=fmt):
                completion = [[{"content": fmt}]]
                rewards = format_reward(completion)
                self.assertEqual(rewards[0], 0.0)

    def test_reasoning_steps_reward(self):
        """Test reasoning_steps_reward with various formats."""
        test_cases = [
            # Full credit cases (3 or more steps)
            ("Step 1: First step\nStep 2: Second step\nStep 3: Third step", 1.0),
            ("First, we do this.\nSecond, we do that.\nFinally, we conclude.", 1.0),
            # Partial credit cases (less than 3 steps)
            ("Step 1: Only step", 1 / 3),
            ("First, we do this.\nFinally, we conclude.", 2 / 3),
            # No credit case
            ("Just plain text without any clear steps", 0.0),
        ]

        for content, expected_reward in test_cases:
            with self.subTest(content=content):
                completion = [[{"content": content}]]
                rewards = reasoning_steps_reward(completion)
                self.assertAlmostEqual(rewards[0], expected_reward)

    def test_multiple_completions(self):
        """Test handling multiple completions at once."""
        completions = [[{"content": r"\boxed{\frac{63}{400}}"}], [{"content": r"\boxed{\frac{64}{400}}"}]]
        solutions = [r"\frac{63}{400}", r"\frac{63}{400}"]

        rewards = accuracy_reward(completions, solutions)
        self.assertEqual(len(rewards), 2)
        self.assertEqual(rewards[0], 1.0)
        self.assertEqual(rewards[1], 0.0)

    def test_cosine_scaled_reward(self):
        """Test cosine_scaled_reward with various cases."""
        # Test parameters
        test_params = {
            "min_value_wrong": -1.0,
            "max_value_wrong": -0.5,
            "min_value_correct": 0.5,
            "max_value_correct": 1.0,
            "max_len": 100,
        }

        # Each case is (content, solution, padded length, expected reward).
        # The boxed answers are already 22 characters long and padding can only
        # lengthen a string, so 22 is the shortest length that can be requested.
        test_cases = [
            # Correct answers with different lengths
            (r"\boxed{\frac{63}{400}}", r"\frac{63}{400}", 22, 0.943),  # Short correct answer
            (r"\boxed{\frac{63}{400}}", r"\frac{63}{400}", 80, 0.547),  # Long correct answer
            # Wrong answers with different lengths
            (r"\boxed{\frac{64}{400}}", r"\frac{63}{400}", 22, -0.942),  # Short wrong answer
            (r"\boxed{\frac{64}{400}}", r"\frac{63}{400}", 80, -0.547),  # Long wrong answer
        ]

        reward_fn = get_cosine_scaled_reward(**test_params)

        for content, solution, content_len, expected_reward in test_cases:
            with self.subTest(content=content, content_len=content_len):
                # Pad content with trailing spaces up to the desired length
                padded_content = content.ljust(content_len)
                completion = [[{"content": padded_content}]]

                rewards = reward_fn(completion, [solution])
                self.assertAlmostEqual(rewards[0], expected_reward, places=2)

    def test_format_reward_specific_multiline(self):
        """Test format_reward with a specific multiline input."""
        inputs = (
            "<think>\n"
            "I will count each distinct object in the image:\n"
            "1. Purple scooter\n"
            "2. Red bicycle\n"
            "3. Green motorcycle\n"
            "4. Gray sedan\n"
            "5. Yellow school bus\n"
            "6. Small green double-decker bus\n"
            "7. Small red car\n"
            "8. Small purple car\n"
            "9. Small gray dirt bike\n"
            "\n"
            "There are 9 distinct objects in total.\n"
            "</think>\n"
            "<answer>9</answer>"
        )
        completion = [[{"content": inputs}]]
        rewards = format_reward(completion)
        self.assertEqual(rewards[0], 1.0)

    def test_same_length_responses(self):
        """Test len_reward when all responses have the same length."""
        completions = [[{"content": r"\boxed{\frac{63}{400}}"}], [{"content": r"\boxed{\frac{64}{400}}"}]]
        solutions = [r"\frac{63}{400}", r"\frac{63}{400}"]

        rewards = len_reward(completions, solutions)
        self.assertEqual(rewards, [0.0, 0.0])

    def test_different_lengths_correct_answers(self):
        """Test len_reward with different length correct answers."""
        completions = [
            [{"content": r"\boxed{\frac{63}{400}}"}],  # shorter
            [{"content": r"\boxed{\frac{63}{400}} " + "x" * 10}],  # longer
        ]
        solutions = [r"\frac{63}{400}", r"\frac{63}{400}"]

        rewards = len_reward(completions, solutions)
        self.assertGreater(rewards[0], rewards[1])  # shorter answer should get higher reward
        self.assertAlmostEqual(rewards[0], 0.5)  # shortest correct answer gets maximum reward

    def test_different_lengths_incorrect_answers(self):
        """Test len_reward with different length incorrect answers."""
        completions = [
            [{"content": r"\boxed{\frac{64}{400}}"}],  # shorter
            [{"content": r"\boxed{\frac{64}{400}} " + "x" * 10}],  # longer
        ]
        solutions = [r"\frac{63}{400}", r"\frac{63}{400}"]

        rewards = len_reward(completions, solutions)
        self.assertLessEqual(rewards[0], 0.0)  # incorrect answers should get non-positive rewards
        self.assertLessEqual(rewards[1], 0.0)
        self.assertGreater(rewards[0], rewards[1])  # shorter answer should still be penalized less

    def test_mixed_correctness(self):
        """Test len_reward with mix of correct and incorrect answers of different lengths."""
        completions = [
            [{"content": r"\boxed{\frac{63}{400}}"}],  # correct, shorter
            [{"content": r"\boxed{\frac{63}{400}} " + "x" * 10}],  # correct, longer
            [{"content": r"\boxed{\frac{64}{400}}"}],  # incorrect, shorter
            [{"content": r"\boxed{\frac{64}{400}} " + "x" * 10}],  # incorrect, longer
        ]
        solutions = [r"\frac{63}{400}"] * 4

        rewards = len_reward(completions, solutions)

        # Shortest correct answer should get positive reward
        self.assertGreater(rewards[0], 0.0)

        # Longer correct answer might get negative reward: it is expected to rank
        # below the shorter incorrect answer, but never below the longer incorrect one
        self.assertGreater(rewards[2], rewards[1])
        self.assertGreaterEqual(rewards[1], rewards[3])

        # Incorrect answers should get non-positive rewards
        self.assertLessEqual(rewards[2], 0.0)
        self.assertLessEqual(rewards[3], 0.0)

        # Shorter answers should get better rewards within their correctness category
        self.assertGreater(rewards[0], rewards[1])  # correct answers
        self.assertGreater(rewards[2], rewards[3])  # incorrect answers

    def test_unparseable_solution(self):
        """Test len_reward with unparseable solution."""
        completions = [[{"content": r"\boxed{answer}"}], [{"content": r"\boxed{answer} " + "x" * 10}]]
        solutions = ["unparseable_latex", "unparseable_latex"]

        rewards = len_reward(completions, solutions)
        self.assertGreater(rewards[0], rewards[1])  # shorter answer should still get better reward
        self.assertAlmostEqual(rewards[0], 0.5)  # treated as correct, shortest gets maximum reward
|
|
|
|
|
|
class TestRepetitionPenaltyReward(unittest.TestCase):
    """Tests for the reward function built by get_repetition_penalty_reward.

    The expected values follow the formula spelled out in the comments below:
    ``(1 - unique_ngrams / total_ngrams) * max_penalty`` over the
    whitespace-separated words of each completion.
    """

    def test_positive_max_penalty_raises_value_error(self):
        """A positive max_penalty must be rejected with a ValueError."""
        with self.assertRaises(ValueError):
            get_repetition_penalty_reward(ngram_size=2, max_penalty=1.0)

        # The dot is escaped so that the pattern only matches the literal "1.5";
        # an unescaped "." would accept any character in that position.
        with self.assertRaisesRegex(ValueError, r"max_penalty 1\.5 should not be positive"):
            get_repetition_penalty_reward(ngram_size=2, max_penalty=1.5)

    def test_no_repetition(self):
        """A sentence without repeated 2-grams is not penalized."""
        reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0)
        completions = [[{"content": "this is a test sentence"}]]
        rewards = reward_fn(completions)
        self.assertEqual(rewards, [0.0])

    def test_full_repetition(self):
        """A completion made of one repeated word gets the expected 2-gram penalty."""
        reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0)
        completions = [[{"content": "this this this this this"}]]

        rewards = reward_fn(completions)
        # (1 - 1/4) * -1 = -0.75
        self.assertEqual(rewards, [-0.75])

    def test_partial_repetition(self):
        """A partially repeated sentence gets a proportional 2-gram penalty."""
        reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0)
        completions = [[{"content": "this is a this is a test"}]]

        rewards = reward_fn(completions)
        # Unique 2-grams: (this, is), (is, a), (a, this), (a, test). 4 unique out of 6 total
        # (1 - 4/6) * -1 = -1/3 = -0.3333...
        self.assertAlmostEqual(rewards[0], -1 / 3)

    def test_multiple_completions(self):
        """Each completion in a batch is scored independently."""
        reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-0.5)
        completions = [
            [{"content": "this is a test"}],
            [{"content": "test test test test"}],
        ]

        rewards = reward_fn(completions)
        # Completion 1: (this, is, a), (is, a, test) -> 2 unique / 2 total -> (1 - 2/2) * -0.5 = 0
        # Completion 2: (test, test, test) -> 1 unique / 2 total -> (1 - 1/2) * -0.5 = -0.25
        self.assertAlmostEqual(rewards[0], 0.0)
        self.assertAlmostEqual(rewards[1], -0.25)

    def test_empty_completion(self):
        """An empty completion is not penalized."""
        reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0)
        completions = [[{"content": ""}]]
        rewards = reward_fn(completions)
        self.assertEqual(rewards, [0.0])

    def test_different_ngram_size(self):
        """The penalty scales with max_penalty when 3-grams are used."""
        reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-2.0)
        completions = [[{"content": "this is a this is a test"}]]

        rewards = reward_fn(completions)
        # 3-grams: (this is a) (is a this) (a this is) (this is a) (is a test).
        # 4 unique out of 5 total -> (1 - 4/5) * -2 = -0.4
        self.assertAlmostEqual(rewards[0], -0.4)

    def test_mixed_case(self):
        """Completions that differ only in letter case get the same reward."""
        reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0)
        completions = [
            [{"content": "This is A Test"}],
            [{"content": "this IS a test"}],
        ]

        rewards = reward_fn(completions)
        # both completions should produce the same reward, because the text gets lowercased
        self.assertAlmostEqual(rewards[0], rewards[1])

    def test_one_word_completion(self):
        """A completion shorter than the n-gram size is not penalized (one word)."""
        reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0)
        completions = [[{"content": "word"}]]

        rewards = reward_fn(completions)
        self.assertEqual(rewards, [0.0])

    def test_two_word_completion(self):
        """A completion shorter than the n-gram size is not penalized (two words)."""
        reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0)
        completions = [[{"content": "two words"}]]

        rewards = reward_fn(completions)
        self.assertEqual(rewards, [0.0])

    def test_three_word_completion(self):
        """A completion forming exactly one 3-gram is not penalized."""
        reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0)
        completions = [[{"content": "three different words"}]]

        rewards = reward_fn(completions)
        self.assertEqual(rewards, [0.0])

    def test_three_word_repetition_completion(self):
        """Four identical words give two identical 3-grams and half the maximum penalty."""
        reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0)
        completions = [[{"content": "word word word word"}]]

        rewards = reward_fn(completions)
        # (word word word) twice: 1 unique out of 2 total -> (1 - 1/2) * -1 = -0.5
        self.assertEqual(rewards, [-0.5])

    def test_four_word_completion_with_repetition(self):
        """Repeated words that still form distinct 3-grams are not penalized."""
        reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0)
        completions = [[{"content": "one two one two"}]]

        rewards = reward_fn(completions)
        # ngrams are (one two one) (two one two). unique is 2 and count is 2, therefore (1-1) * -1.
        self.assertEqual(rewards, [0.0])

    def test_five_word_completion_with_repetition(self):
        """Five words with repeated tokens but three distinct 3-grams are not penalized."""
        reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-0.5)
        completions = [[{"content": "A B C A B"}]]

        rewards = reward_fn(completions)
        # (A B C) (B C A) (C A B). unique is 3. count is 3 (1-1) * -.5 = 0
        self.assertEqual(rewards, [0.0])

    def test_six_word_completion_with_repetition(self):
        """A 3-gram that occurs twice among four yields a quarter of the maximum penalty."""
        reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0)
        completions = [[{"content": "A B C A B C"}]]

        rewards = reward_fn(completions)
        # (A B C) (B C A) (C A B) (A B C). 3 unique out of 4 total -> (1 - 3/4) * -1 = -0.25
        self.assertEqual(rewards, [-0.25])

    def test_long_completion_with_repetition(self):
        """A longer completion with several repeated 3-grams gets the expected penalty."""
        reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0)
        completions = [[{"content": "A B C A B C E F G A B C A B C"}]]
        rewards = reward_fn(completions)
        # 15 words -> 13 3-grams, 8 of them unique -> (1 - 8/13) * -1 = -5/13 = -0.3846...
        self.assertAlmostEqual(rewards[0], -0.3846, places=4)

    def test_long_completion_without_repetition(self):
        """A longer completion made of all-distinct words is not penalized."""
        reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0)
        completions = [[{"content": "A B C D E F G H I J K L"}]]

        rewards = reward_fn(completions)
        self.assertEqual(rewards, [0.0])
|
|
|
|
|
|
# Run the whole suite when this file is executed directly as a script;
# importing the module (e.g. through a test runner) does not trigger it.
if __name__ == "__main__":
    unittest.main()
|