Kashif Rasul 90a6de94c7
Revert "Weighted reward functions (#213)" (#317)
This reverts commit fbea53267b9676fc89e92c9a24c83cb23e0884d0.
2025-02-13 15:00:05 +01:00

318 lines
14 KiB
Python

import unittest
from open_r1.rewards import (
accuracy_reward,
format_reward,
get_cosine_scaled_reward,
get_repetition_penalty_reward,
len_reward,
reasoning_steps_reward,
)
class TestRewards(unittest.TestCase):
def test_accuracy_reward_correct_answer(self):
"""Test accuracy_reward with a correct answer."""
completion = [[{"content": r"\boxed{\frac{63}{400}}"}]]
solution = [r"\frac{63}{400}"]
rewards = accuracy_reward(completion, solution)
self.assertEqual(rewards[0], 1.0)
def test_accuracy_reward_wrong_answer(self):
"""Test accuracy_reward with an incorrect answer."""
completion = [[{"content": r"\boxed{\frac{64}{400}}"}]]
solution = [r"\frac{63}{400}"]
rewards = accuracy_reward(completion, solution)
self.assertEqual(rewards[0], 0.0)
def test_format_reward_correct(self):
"""Test format_reward with correct format."""
completion = [[{"content": "<think>Some reasoning</think><answer>The answer</answer>"}]]
rewards = format_reward(completion)
self.assertEqual(rewards[0], 1.0)
def test_format_reward_incorrect(self):
"""Test format_reward with incorrect format."""
incorrect_formats = [
"<think>Only thinking</think>",
"<answer>Only answer</answer>",
"No tags at all",
"<think>Missing closing</think><answer>Missing closing",
"<think>Wrong order</answer><answer>Wrong order</think>",
]
for fmt in incorrect_formats:
completion = [[{"content": fmt}]]
rewards = format_reward(completion)
self.assertEqual(rewards[0], 0.0)
def test_reasoning_steps_reward(self):
"""Test reasoning_steps_reward with various formats."""
test_cases = [
# Full credit cases (3 or more steps)
("Step 1: First step\nStep 2: Second step\nStep 3: Third step", 1.0),
("First, we do this.\nSecond, we do that.\nFinally, we conclude.", 1.0),
# Partial credit cases (less than 3 steps)
("Step 1: Only step", 1 / 3),
("First, we do this.\nFinally, we conclude.", 2 / 3),
# No credit case
("Just plain text without any clear steps", 0.0),
]
for content, expected_reward in test_cases:
completion = [[{"content": content}]]
rewards = reasoning_steps_reward(completion)
self.assertAlmostEqual(rewards[0], expected_reward)
def test_multiple_completions(self):
"""Test handling multiple completions at once."""
completions = [[{"content": r"\boxed{\frac{63}{400}}"}], [{"content": r"\boxed{\frac{64}{400}}"}]]
solutions = [r"\frac{63}{400}", r"\frac{63}{400}"]
rewards = accuracy_reward(completions, solutions)
self.assertEqual(len(rewards), 2)
self.assertEqual(rewards[0], 1.0)
self.assertEqual(rewards[1], 0.0)
def test_cosine_scaled_reward(self):
"""Test cosine_scaled_reward with various cases."""
# Test parameters
test_params = {
"min_value_wrong": -1.0,
"max_value_wrong": -0.5,
"min_value_correct": 0.5,
"max_value_correct": 1.0,
"max_len": 100,
}
test_cases = [
# Correct answers with different lengths
(r"\boxed{\frac{63}{400}}", r"\frac{63}{400}", 20, 0.943), # Short correct answer
(r"\boxed{\frac{63}{400}}", r"\frac{63}{400}", 80, 0.547), # Long correct answer
# Wrong answers with different lengths
(r"\boxed{\frac{64}{400}}", r"\frac{63}{400}", 20, -0.942), # Short wrong answer
(r"\boxed{\frac{64}{400}}", r"\frac{63}{400}", 80, -0.547), # Long wrong answer
]
for content, solution, content_len, expected_reward in test_cases:
# Pad content to desired length
padded_content = content + " " * (content_len - len(content))
completion = [[{"content": padded_content}]]
rewards = get_cosine_scaled_reward(**test_params)(completion, [solution])
self.assertAlmostEqual(rewards[0], expected_reward, places=2)
def test_format_reward_specific_multiline(self):
"""Test format_reward with a specific multiline input."""
inputs = "<think>\nI will count each distinct object in the image:\n1. Purple scooter\n2. Red bicycle\n3. Green motorcycle\n4. Gray sedan\n5. Yellow school bus\n6. Small green double-decker bus\n7. Small red car\n8. Small purple car\n9. Small gray dirt bike\n\nThere are 9 distinct objects in total.\n</think>\n<answer>9</answer>"
completion = [[{"content": inputs}]]
rewards = format_reward(completion)
self.assertEqual(rewards[0], 1.0)
def test_same_length_responses(self):
"""Test len_reward when all responses have the same length."""
completions = [[{"content": r"\boxed{\frac{63}{400}}"}], [{"content": r"\boxed{\frac{64}{400}}"}]]
solutions = [r"\frac{63}{400}", r"\frac{63}{400}"]
rewards = len_reward(completions, solutions)
self.assertEqual(rewards, [0.0, 0.0])
def test_different_lengths_correct_answers(self):
"""Test len_reward with different length correct answers."""
completions = [
[{"content": r"\boxed{\frac{63}{400}}"}], # shorter
[{"content": r"\boxed{\frac{63}{400}} " + "x" * 10}], # longer
]
solutions = [r"\frac{63}{400}", r"\frac{63}{400}"]
rewards = len_reward(completions, solutions)
self.assertGreater(rewards[0], rewards[1]) # shorter answer should get higher reward
self.assertAlmostEqual(rewards[0], 0.5) # shortest correct answer gets maximum reward
def test_different_lengths_incorrect_answers(self):
"""Test len_reward with different length incorrect answers."""
completions = [
[{"content": r"\boxed{\frac{64}{400}}"}], # shorter
[{"content": r"\boxed{\frac{64}{400}} " + "x" * 10}], # longer
]
solutions = [r"\frac{63}{400}", r"\frac{63}{400}"]
rewards = len_reward(completions, solutions)
self.assertLessEqual(rewards[0], 0.0) # incorrect answers should get non-positive rewards
self.assertLessEqual(rewards[1], 0.0)
self.assertGreater(rewards[0], rewards[1]) # shorter answer should still be penalized less
def test_mixed_correctness(self):
"""Test len_reward with mix of correct and incorrect answers of different lengths."""
completions = [
[{"content": r"\boxed{\frac{63}{400}}"}], # correct, shorter
[{"content": r"\boxed{\frac{63}{400}} " + "x" * 10}], # correct, longer
[{"content": r"\boxed{\frac{64}{400}}"}], # incorrect, shorter
[{"content": r"\boxed{\frac{64}{400}} " + "x" * 10}], # incorrect, longer
]
solutions = [r"\frac{63}{400}"] * 4
rewards = len_reward(completions, solutions)
# Shortest correct answer should get positive reward
self.assertGreater(rewards[0], 0.0)
# Longer correct answer might get negative reward:
self.assertGreater(rewards[2], rewards[1])
self.assertGreaterEqual(rewards[1], rewards[3])
# Incorrect answers should get non-positive rewards
self.assertLessEqual(rewards[2], 0.0)
self.assertLessEqual(rewards[3], 0.0)
# Shorter answers should get better rewards within their correctness category
self.assertGreater(rewards[0], rewards[1]) # correct answers
self.assertGreater(rewards[2], rewards[3]) # incorrect answers
def test_unparseable_solution(self):
"""Test len_reward with unparseable solution."""
completions = [[{"content": r"\boxed{answer}"}], [{"content": r"\boxed{answer} " + "x" * 10}]]
solutions = ["unparseable_latex", "unparseable_latex"]
rewards = len_reward(completions, solutions)
self.assertGreater(rewards[0], rewards[1]) # shorter answer should still get better reward
self.assertAlmostEqual(rewards[0], 0.5) # treated as correct, shortest gets maximum reward
class TestRepetitionPenaltyReward(unittest.TestCase):
def test_positive_max_penalty_raises_value_error(self):
with self.assertRaises(ValueError):
get_repetition_penalty_reward(ngram_size=2, max_penalty=1.0)
with self.assertRaisesRegex(ValueError, "max_penalty 1.5 should not be positive"):
get_repetition_penalty_reward(ngram_size=2, max_penalty=1.5)
def test_no_repetition(self):
reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0)
completions = [[{"content": "this is a test sentence"}]]
rewards = reward_fn(completions)
self.assertEqual(rewards, [0.0])
def test_full_repetition(self):
reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0)
completions = [[{"content": "this this this this this"}]]
rewards = reward_fn(completions)
# (1 - 1/4) * -1 = -0.75
self.assertEqual(rewards, [-0.75])
def test_partial_repetition(self):
reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0)
completions = [[{"content": "this is a this is a test"}]]
rewards = reward_fn(completions)
# Unique 2-grams: (this, is), (is, a), (a, this), (a, test). 4 unique out of 6 total
# (1 - 4/6) * -1 = -1/3 = -0.3333...
self.assertAlmostEqual(rewards[0], -1 / 3)
def test_multiple_completions(self):
reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-0.5)
completions = [
[{"content": "this is a test"}],
[{"content": "test test test test"}],
]
rewards = reward_fn(completions)
# Completion 1: (this, is, a), (is, a, test) -> 2 unique / 2 total -> (1 - 2/2) * -0.5 = 0
# Completion 2: (test, test, test) -> 1 unique / 2 total -> (1 - 1/2) * -0.5 = -0.25
self.assertAlmostEqual(rewards[0], 0.0)
self.assertAlmostEqual(rewards[1], -0.25)
def test_empty_completion(self):
reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0)
completions = [[{"content": ""}]]
rewards = reward_fn(completions)
self.assertEqual(rewards, [0.0])
def test_different_ngram_size(self):
reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-2.0)
completions = [[{"content": "this is a this is a test"}]]
rewards = reward_fn(completions)
self.assertAlmostEqual(rewards[0], -0.4)
def test_mixed_case(self):
reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0)
completions = [
[{"content": "This is A Test"}],
[{"content": "this IS a test"}],
]
rewards = reward_fn(completions)
# both completions should produce the same reward, because the text gets lowercased
self.assertAlmostEqual(rewards[0], rewards[1])
def test_one_word_completion(self):
reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0)
completions = [[{"content": "word"}]]
rewards = reward_fn(completions)
self.assertEqual(rewards, [0.0])
def test_two_word_completion(self):
reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0)
completions = [[{"content": "two words"}]]
rewards = reward_fn(completions)
self.assertEqual(rewards, [0.0])
def test_three_word_completion(self):
reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0)
completions = [[{"content": "three different words"}]]
rewards = reward_fn(completions)
self.assertEqual(rewards, [0.0])
def test_three_word_repetition_completion(self):
reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0)
completions = [[{"content": "word word word word"}]]
rewards = reward_fn(completions)
self.assertEqual(rewards, [-0.5])
def test_four_word_completion_with_repetition(self):
reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0)
completions = [[{"content": "one two one two"}]]
rewards = reward_fn(completions)
# ngrams are (one two one) (two one two). unique is 2 and count is 2, therefore (1-1) * -1.
self.assertEqual(rewards, [0.0])
def test_five_word_completion_with_repetition(self):
reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-0.5)
completions = [[{"content": "A B C A B"}]]
rewards = reward_fn(completions)
# (A B C) (B C A) (C A B). unique is 3. count is 3 (1-1) * -.5 = 0
self.assertEqual(rewards, [0.0])
def test_six_word_completion_with_repetition(self):
reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0)
completions = [[{"content": "A B C A B C"}]]
rewards = reward_fn(completions)
self.assertEqual(rewards, [-0.25])
def test_long_completion_with_repetition(self):
reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0)
completions = [[{"content": "A B C A B C E F G A B C A B C"}]]
rewards = reward_fn(completions)
self.assertAlmostEqual(rewards[0], -0.3846, places=4)
def test_long_completion_without_repetition(self):
reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0)
completions = [[{"content": "A B C D E F G H I J K L"}]]
rewards = reward_fn(completions)
self.assertEqual(rewards, [0.0])
if __name__ == "__main__":
unittest.main()