diff --git a/games/wordle/master.py b/games/wordle/master.py index 53ac2211942c284ed7517f6dc6952072129c9db4..4a8c94be1e0217c9d749a93e299d470cf274e5a1 100644 --- a/games/wordle/master.py +++ b/games/wordle/master.py @@ -2,7 +2,7 @@ from typing import List, Tuple, Dict import numpy as np from backends import Model, HumanModel -from clemgame.clemgame import GameMaster, GameBenchmark +from clemgame.clemgame import GameMaster, GameBenchmark, GameScorer from clemgame import get_logger import clemgame.metrics as metrics from games.wordle.game import WordleGame @@ -19,8 +19,6 @@ class WordleGameMaster(GameMaster): self.config = experiment self.player_model_names = [player_model.get_name() for player_model in player_models] - self.cm = ComputeMetrics() - def setup(self, game_id, target_word, target_word_clue, target_word_difficulty): self.game_id = game_id @@ -475,6 +473,65 @@ class WordleGameMaster(GameMaster): f"game_result = {self.game_final_status}", data_for_computation ) + +class WordleGameScorer(GameScorer): + + def __init__(self, experiment: Dict, game_instance: Dict): + super().__init__(GAME_NAME, experiment, game_instance) + self.cm = ComputeMetrics() + + def compute_scores(self, episode_interactions: Dict) -> None: + for key, val in episode_interactions.items(): + if key == "turns": + # Look for last turn data and in that 'action' key + if ( + val + and val[-1] + and "action" in val[-1][-1] + and "data_for_computation" in val[-1][-1]["action"] + ): + data_to_compute_scores = val[-1][-1]["action"][ + "data_for_computation" + ] + if data_to_compute_scores: + aborted, loss = self._compute_game_status( + data_to_compute_scores["game_final_status"] + ) + self._compute_req_count( + data_to_compute_scores["guesser_req_count"], + data_to_compute_scores["critic_req_count"], + data_to_compute_scores["guesser_parsed_req_count"], + data_to_compute_scores["critic_parsed_req_count"], + data_to_compute_scores["turns_req_count"], + data_to_compute_scores["turns_parse_count"], + ) + self._compute_game_specific_metrics( + aborted, + loss, + data_to_compute_scores["turns_guess_feedback"], + data_to_compute_scores["use_critic"], + data_to_compute_scores["critic_guesses_change"], + data_to_compute_scores["target_word_difficulty"], + ) + return + + def _compute_game_status(self, status): + aborted = 0 + loss = 0 + success = 0 + + if status == "ABORTED": + aborted = 1 + elif status == "LOSS": + loss = 1 + else: + success = 1 + + self.log_episode_score(metrics.METRIC_ABORTED, aborted) + self.log_episode_score(metrics.METRIC_LOSE, loss) + self.log_episode_score(metrics.METRIC_SUCCESS, success) + return aborted, loss + def _compute_req_count( self, guesser_req_count, @@ -530,23 +587,6 @@ class WordleGameMaster(GameMaster): for idx, score in enumerate(turns_violate_count): self.log_turn_score(idx + 1, "Violated Request Count", score) - def _compute_game_status(self, status): - aborted = 0 - loss = 0 - success = 0 - - if status == "ABORTED": - aborted = 1 - elif status == "LOSS": - loss = 1 - else: - success = 1 - - self.log_episode_score(metrics.METRIC_ABORTED, aborted) - self.log_episode_score(metrics.METRIC_LOSE, loss) - self.log_episode_score(metrics.METRIC_SUCCESS, success) - return aborted, loss - def _compute_game_specific_metrics( self, aborted, @@ -610,7 +650,7 @@ class WordleGameMaster(GameMaster): self.log_episode_score(metrics.BENCH_SCORE, speed) self.log_episode_score("repeats guess", repeats_guess) self.log_episode_score("total guess repetitions", num_guess_repeats) - self.log_key("Target Word Difficulty", target_word_difficulty) + # self.log_key("Target Word Difficulty", target_word_difficulty) todo scoring should not change the interaction for idx, score in enumerate(turn_score): self.log_turn_score(idx + 1, "closeness score", score) @@ -665,41 +705,6 @@ class WordleGameMaster(GameMaster): "Non-Repetition-Guesser-On-Critic-Disagreement", 0 ) - def compute_scores(self, episode_interactions: Dict) -> None: - for key, val in episode_interactions.items(): - if key == "turns": - # Look for last turn data and in that 'action' key - if ( - val - and val[-1] - and "action" in val[-1][-1] - and "data_for_computation" in val[-1][-1]["action"] - ): - data_to_compute_scores = val[-1][-1]["action"][ - "data_for_computation" - ] - if data_to_compute_scores: - aborted, loss = self._compute_game_status( - data_to_compute_scores["game_final_status"] - ) - self._compute_req_count( - data_to_compute_scores["guesser_req_count"], - data_to_compute_scores["critic_req_count"], - data_to_compute_scores["guesser_parsed_req_count"], - data_to_compute_scores["critic_parsed_req_count"], - data_to_compute_scores["turns_req_count"], - data_to_compute_scores["turns_parse_count"], - ) - self._compute_game_specific_metrics( - aborted, - loss, - data_to_compute_scores["turns_guess_feedback"], - data_to_compute_scores["use_critic"], - data_to_compute_scores["critic_guesses_change"], - data_to_compute_scores["target_word_difficulty"], - ) - return - class WordleGameBenchmark(GameBenchmark): def __init__(self): @@ -711,6 +716,9 @@ class WordleGameBenchmark(GameBenchmark): def create_game_master(self, experiment: Dict, player_models: List[Model]) -> GameMaster: return WordleGameMaster(self.name, experiment, player_models) + def create_game_scorer(self, experiment: Dict, game_instance: Dict) -> GameScorer: + return WordleGameScorer(experiment, game_instance) + def is_single_player(self) -> bool: return True diff --git a/games/wordle_withclue/master.py b/games/wordle_withclue/master.py index 5c3a746090234cb572351525df125e12a0447510..aaa40748a108cca5a07dc2d33295b8b94ef27708 100644 --- a/games/wordle_withclue/master.py +++ b/games/wordle_withclue/master.py @@ -1,8 +1,8 @@ from typing import Dict, List from backends import Model -from clemgame.clemgame import GameBenchmark, GameMaster -from games.wordle.master import WordleGameMaster +from clemgame.clemgame import GameBenchmark, GameMaster, GameScorer +from games.wordle.master import WordleGameMaster, WordleGameScorer # this will resolve into subdirectories to find the instances GAME_NAME = "wordle_withclue" @@ -18,5 +18,8 @@ class WordleWithClueGameBenchmark(GameBenchmark): def create_game_master(self, experiment: Dict, player_models: List[Model]) -> GameMaster: return WordleGameMaster(self.name, experiment, player_models) + def create_game_scorer(self, experiment: Dict, game_instance: Dict) -> GameScorer: + return WordleGameScorer(experiment, game_instance) + def is_single_player(self) -> bool: return True diff --git a/games/wordle_withcritic/master.py b/games/wordle_withcritic/master.py index 79632d3939e5ce9cd4326a280b536b5722936644..4d5cd6c0c4ab20aaa3c1690982be25aada50ce5f 100644 --- a/games/wordle_withcritic/master.py +++ b/games/wordle_withcritic/master.py @@ -1,8 +1,8 @@ from typing import Dict, List from backends import Model -from clemgame.clemgame import GameBenchmark, GameMaster -from games.wordle.master import WordleGameMaster +from clemgame.clemgame import GameBenchmark, GameMaster, GameScorer +from games.wordle.master import WordleGameMaster, WordleGameScorer GAME_NAME = "wordle_withcritic" @@ -16,3 +16,6 @@ class WordleWithClueAndCriticGameBenchmark(GameBenchmark): def create_game_master(self, experiment: Dict, player_models: List[Model]) -> GameMaster: return WordleGameMaster(self.name, experiment, player_models) + + def create_game_scorer(self, experiment: Dict, game_instance: Dict) -> GameScorer: + return WordleGameScorer(experiment, game_instance) \ No newline at end of file