Unverified commit 7fc95710, authored by Philipp, committed by GitHub

Feat/refactor model pairing (#35)

Fixes #34

- now clem automatically assumes self-play when only a single model is given (and the game is multi-player)
- removed the option to run all games (this simplifies the code; running all games is now done by a pipeline script)
- improved cli usage `python3 scripts/cli.py run -g <game> -m <model1> [<model2>]` (see the examples below)
- pipeline scripts and README have been updated (no changes in the games)
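For example (these mirror the commands used in the updated READMEs; the model names are just examples):

```
# self-play: the single given model plays both roles of the 2-player game
python3 scripts/cli.py run -g taboo -m gpt-3.5-turbo-1106

# explicit pairing of two different models
python3 scripts/cli.py run -g taboo -m gpt-3.5-turbo-1106 gpt-4-0613
```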
parent 46c140cc
""" Main entry point """
from typing import List
import clemgame
from datetime import datetime
......@@ -20,40 +22,22 @@ def list_games():
stdout_logger.info(" Game: %s -> %s", game.name, game.get_description())
def run(game_name: str, temperature: float, model_name: str = None, experiment_name: str = None):
logger.info("Running benchmark for: %s (model_name=%s)", game_name,
model_name if model_name is not None else "see experiment configs")
def run(game_name: str, temperature: float, models: List[str] = None, experiment_name: str = None):
assert 0.0 <= temperature <= 1.0, "Temperature must be in [0.,1.]"
if experiment_name:
logger.info("Only running experiment: %s", experiment_name)
if game_name == "all" and model_name is not None:
if string_utils.is_pair_descriptor(model_name):
raise ValueError("'all' argument only allows self-play (single model arguments)."
" Please provide individual model names e.g. model-a which is"
" then automatically expanded to a dialogue pair for multi-player.")
games_list = load_benchmarks()
else:
games_list = [load_benchmark(game_name)]
total_games = len(games_list)
for idx, benchmark in enumerate(games_list):
try:
if experiment_name:
benchmark.filter_experiment.append(experiment_name)
stdout_logger.info(f"Run game {idx + 1} of {total_games}: {benchmark.name}")
time_start = datetime.now()
# checking for None here is important b.c. then experiment conf is used
if game_name == "all" and model_name is not None: # automatically handle self-play
if benchmark.is_single_player():
dialog_pair = model_name
else: # multi-player
dialog_pair = string_utils.to_pair_descriptor([model_name, model_name])
else: # for particular games calls take the given argument directly (the user should know)
dialog_pair = model_name
benchmark.run(dialog_pair=dialog_pair, temperature=temperature)
time_end = datetime.now()
logger.info(f"Run {benchmark.name} took {str(time_end - time_start)}")
except Exception as e:
logger.error(e, exc_info=True)
try:
benchmark = load_benchmark(game_name)
logger.info("Running benchmark for: %s (models=%s)", game_name,
models if models is not None else "see experiment configs")
if experiment_name:
benchmark.filter_experiment.append(experiment_name)
time_start = datetime.now()
benchmark.run(player_backends=models, temperature=temperature)
time_end = datetime.now()
logger.info(f"Run {benchmark.name} took {str(time_end - time_start)}")
except Exception as e:
logger.error(e, exc_info=True)
def score(game_name: str, experiment_name: str = None):
......
......@@ -650,7 +650,7 @@ class GameBenchmark(GameResourceLocator):
stdout_logger.error(
f"{self.name}: '{error_count}' exceptions occurred: See clembench.log for details.")
def run(self, dialog_pair: str, temperature: float):
def run(self, player_backends: List[str], temperature: float):
"""
Runs game-play on all game instances for a game.
There must be an instances.json with the following structure:
......@@ -693,11 +693,8 @@ class GameBenchmark(GameResourceLocator):
# Determine dialogue partners: How often to run the experiment with different partners
dialogue_partners: List[List[str]] = []
if dialog_pair: # favor runtime argument over experiment config
if string_utils.is_pair_descriptor(dialog_pair):
dialogue_partners = [string_utils.to_model_pair(dialog_pair)]
else:
dialogue_partners = [[dialog_pair]]
if player_backends: # favor runtime argument over experiment config
dialogue_partners = [player_backends]
elif "dialogue_partners" in experiment:
dialogue_partners = experiment["dialogue_partners"]
self.logger.info(f"{self.name}: Detected 'dialogue_partners' in experiment config. "
......@@ -705,27 +702,29 @@ class GameBenchmark(GameResourceLocator):
if not dialogue_partners:
message = (f"{self.name}: Neither 'dialogue_partners' set in experiment instance"
f" nor 'model_name' given as run arg")
f" nor 'models' given as run arg")
stdout_logger.error(message)
raise ValueError(message)
for dialogue_pair in dialogue_partners:
if len(dialogue_pair) == 1 and self.is_single_player():
model_name = dialogue_pair[0]
dialogue_pair_desc = f"{model_name}-t{temperature}"
stdout_logger.info(f"With single player: {dialogue_pair_desc}")
if self.is_single_player():
if len(dialogue_pair) > 1:
message = f"Too many player for singe-player game '{self.name}': '{len(dialogue_partners)}'"
stdout_logger.error(message)
raise ValueError(message)
model_desc_0 = f"{dialogue_pair[0]}-t{temperature}"
# we still store to the model--model dir (virtual self-play)
dialogue_pair_desc = f"{dialogue_pair_desc}--{dialogue_pair_desc}"
elif len(dialogue_pair) == 2 and not self.is_single_player():
dialogue_pair_desc = string_utils.to_pair_descriptor([f"{model_name}-t{temperature}"
for model_name in dialogue_pair])
stdout_logger.info(f"With dialog partners: {dialogue_pair_desc}")
else:
message = (f"Invalid model pairing {dialogue_pair}"
f" for a {'single' if self.is_single_player() else 'multi'}-player game."
f" For single-player expected only a single model, otherwise a pair.")
stdout_logger.error(message)
raise ValueError(message)
dialogue_pair_desc = f"{model_desc_0}--{model_desc_0}"
else: # 2-players
if len(dialogue_pair) > 2:
message = f"Too many player for two-player game '{self.name}': '{len(dialogue_partners)}'"
stdout_logger.error(message)
raise ValueError(message)
if len(dialogue_pair) == 1:
dialogue_pair.append(dialogue_pair[0]) # model expansion
model_desc_0 = f"{dialogue_pair[0]}-t{temperature}"
model_desc_1 = f"{dialogue_pair[1]}-t{temperature}"
dialogue_pair_desc = f"{model_desc_0}--{model_desc_1}"
episode_counter = 0
self.logger.info("Activity: %s Experiment: %s Partners: %s Episode: %d",
......
......@@ -12,11 +12,17 @@ If you're completely new to this, it might make sense to look at two Jupyter not
The benchmark is run for a particular game -- for example the taboo game -- using the following command:
```
python3 scripts/cli.py -m gpt-3.5-turbo--gpt-3.5-turbo run taboo
python3 scripts/cli.py run -g taboo -m gpt-3.5-turbo-1106
```
From the call we already see that taboo is a two-player game because we need to provide a descriptor for two models.
These models are supposed to play certain roles in the game, here a clue giver and a guesser.
_Note: when only a single model is given for a 2-player game, clem will use this model for both players!_
As taboo is a game of two players (a clue giver and a guesser), we can also let two different
models play the game, which would look like this:
```
python3 scripts/cli.py run -g taboo -m gpt-3.5-turbo-1106 gpt-4-0613
```
### GameBenchmark class
......@@ -51,7 +57,7 @@ of the necessary plumbing and executes the main logic for a benchmark run (calli
Aside: The return value of `get_description` is shown for the `python3 scripts/cli.py ls` command.
Then the benchmark code checks whether your game is a single-player or multi-player game (the default is multi-player),
so that the `-m gpt-3.5-turbo--gpt-3.5-turbo` option is properly handled.
so that the `-m gpt-3.5-turbo-1106` option is properly handled.
Then the `run(player_backends, temperature)` method is called, which is already implemented by `GameBenchmark`.
This is when the `GameMaster` becomes relevant (which should be returned by your `create_game_master()` factory method).
......@@ -388,36 +394,37 @@ Add to the module a `master.py` that implements the `GameMaster`.
### Running experiments with your game
```
python3 scripts/cli.py -m gpt-3.5-turbo [-e greet_en] run hellogame
python3 scripts/cli.py run -g hellogame -m gpt-3.5-turbo-1106 [-e greet_en]
```
Note: With `-e` you can specify a specific experiment to run.
This will create a records folder in your game directory as the following:
This will create a results folder in the project root as follows:
```
records
└── gpt-3.5-turbo
└── greet_en
├── episode_0
│ ├── instance_0.json
│ ├── interaction.json
│ └── transcript.html
├── episode_1
│ ├── instance_1.json
│ ├── interaction.json
│ └── transcript.html
│ ...
└── experiment_greet_en.json
results
└── gpt-3.5-turbo-1106-t0.0--gpt-3.5-turbo-1106-t0.0
└── hellogame
└── 0_greet_en
├── episode_0
│ ├── instance.json
│ ├── interaction.json
│ └── transcript.html
├── episode_1
│ ├── instance.json
│ ├── interaction.json
│ └── transcript.html
│ ...
└── experiment_greet_en.json
```
The top level is `records` followed by directories that mention the involved model (pairings).
The top level is `results` followed by directories that mention the involved model (pairings).
The model (pairing) sub-folders will contain a directory structure for each experiment
and the experiments' episodes (game plays).
The episodes are defined by the game instances (from the `instances.json`) and
contain the instance parameters `instance_n.json`, an `interaction.json` and a nice human-viewable `transcript.html`.
contain the instance parameters `instance.json`, an `interaction.json` and a nice human-viewable `transcript.html`.
The experiment folder also contains an `experiment_name.json` with the run parameters.
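If you want to locate the generated files after a run, standard shell tools suffice; for example:

```
# list all generated transcripts under the results folder
find results -name "transcript.html"
```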
......
......@@ -7,16 +7,10 @@ Detailed documentation about setting up the virtual environment, installing libr
The benchmark is run for a particular game with a particular model -- for example, the taboo game on GPT-3.5-turbo -- using the following command:
```
python3 scripts/cli.py -m gpt-3.5-turbo--gpt-3.5-turbo run taboo
python3 scripts/cli.py run -g taboo -m gpt-3.5-turbo
```
Or run the following command to run all existing games on the chosen model (GPT-3.5-turbo):
```
python3 scripts/cli.py -m gpt-3.5-turbo--gpt-3.5-turbo run all
```
Alternatively, the benchmark run can be scripted as follows to run multiple games and model combinations using the following bash script (here, only GPT-3.5-turbo and GPT-4 are chosen as references):
Alternatively, the benchmark run can be scripted to run multiple games with various models (in self-play mode) using the following bash script (here, only GPT-3.5-turbo and GPT-4 are chosen as references):
```
#!/bin/bash
......@@ -30,24 +24,30 @@ game_runs=(
# Single-player: privateshared
"privateshared gpt-3.5-turbo"
"privateshared gpt-4"
# Single-player: wordle
"wordle gpt-3.5-turbo"
"wordle gpt-4"
# Single-player: wordle_withclue
"wordle_withclue gpt-3.5-turbo"
"wordle_withclue gpt-4"
# Multi-player taboo
"taboo gpt-3.5-turbo--gpt-3.5-turbo"
"taboo gpt-4--gpt-4"
"taboo gpt-3.5-turbo"
"taboo gpt-4"
# Multi-player referencegame
"referencegame gpt-3.5-turbo--gpt-3.5-turbo"
"referencegame gpt-4--gpt-4"
"referencegame gpt-3.5-turbo"
"referencegame gpt-4"
# Multi-player imagegame
"imagegame gpt-3.5-turbo--gpt-3.5-turbo"
"imagegame gpt-4--gpt-4"
"imagegame gpt-3.5-turbo"
"imagegame gpt-4"
# Multi-player wordle_withcritic
"wordle_withcritic gpt-3.5-turbo--gpt-3.5-turbo"
"wordle_withcritic gpt-4--gpt-4"
"wordle_withcritic gpt-3.5-turbo"
"wordle_withcritic gpt-4"
)
total_runs=${#game_runs[@]}
echo "Number of benchmark runs: $total_runs"
......@@ -69,13 +69,25 @@ Once the benchmark runs are finished, the `results` folder will include all run
Run the following command to generate transcriptions of the dialogues. The script generates LaTeX and HTML files of the dialogues under each episode of a particular experiment.
```
python3 scripts/cli.py transcribe all
python3 scripts/cli.py transcribe
```
The default is to do this for all games, but we can also do it for just a single game:
```
python3 scripts/cli.py transcribe -g taboo
```
Next, run the scoring command that calculates the turn- and episode-specific metrics defined for each game. This generates a `scores.json` file stored in the same folder as the transcripts and other files of a specific episode.
```
python3 scripts/cli.py score all
python3 scripts/cli.py score
```
Again, the default is all games, but we can also score just a single game:
```
python3 scripts/cli.py score -g taboo
```
### Evaluate the Benchmark Run & Update the Leaderboard
......
......@@ -62,10 +62,15 @@ source prepare_path.sh
Then run the cli script
```
python3 scripts/cli.py -m gpt-3.5-turbo--gpt-3.5-turbo run taboo
python3 scripts/cli.py run -g taboo -m gpt-3.5-turbo
```
(The `-m` option tells the script which model to use; since taboo is a two player game, we need both partners to be specified here.)
The `-m` option tells the script which model to use. Since taboo is a two-player game, we can also
let two different models play the game, which would look like this:
```
python3 scripts/cli.py run -g taboo -m gpt-3.5-turbo-1106 gpt-4-0613
```
This should give you an output on the terminal that contains something like the following:
......@@ -73,20 +78,21 @@ This should give you an output on the terminal that contains something like the
Playing games: 100%|██████████████████████████████████| 20/20 [00:48<00:00, 2.41s/it]
```
If that is the case, output (transcripts of the games played) will have been written to `results/taboo` (in the main directory of the code).
If that is the case, output (transcripts of the games played) will have been written to
`results/gpt-3.5-turbo-t0.0--gpt-3.5-turbo-t0.0/taboo` (in the main directory of the code).
Unfortunately, at the moment the code often fails silently, for example if model names are wrong, so make sure that you see the confirmation that the game actually has been played. Have a look at the file `clembench.log` if you suspect that something might be wrong.
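For a quick sanity check after a run you can, for example, list the results folder and look at the end of the log (plain shell commands; paths as described above):

```
ls results/              # a model-pairing directory such as gpt-3.5-turbo-t0.0--gpt-3.5-turbo-t0.0 should appear
tail -n 30 clembench.log # check the end of the log if the run did not complete as expected
```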
You can get more information about what you can do with the `cli` script via:
```
python3 scripts/cli.py --help
python3 scripts/cli.py run --help
```
For example, you can use that script to get a more readable version of the game play jsons like so:
```
python3 scripts/cli.py transcribe taboo
python3 scripts/cli.py transcribe -g taboo
```
After running this, the `results` directory will now hold html and LaTeX views of the transcripts.
......@@ -95,23 +101,23 @@ After running this, the `results` directory will now hold html and LaTeX views o
To run other game masters individually, use the following commands. Note that some games are single-player (privateshared) and some can be multi-player (taboo, referencegame, imagegame, wordle).
```
python scripts/cli.py -m gpt-3.5-turbo run privateshared
python scripts/cli.py run -g privateshared -m gpt-3.5-turbo
```
```
python scripts/cli.py -m gpt-3.5-turbo--gpt-3.5-turbo run taboo
python scripts/cli.py run -g taboo -m gpt-3.5-turbo
```
```
python scripts/cli.py -m gpt-3.5-turbo--gpt-3.5-turbo run imagegame
python scripts/cli.py run -g imagegame -m gpt-3.5-turbo
```
```
python scripts/cli.py -m gpt-3.5-turbo--gpt-3.5-turbo run referencegame
python scripts/cli.py run -g referencegame -m gpt-3.5-turbo
```
```
python scripts/cli.py -m gpt-3.5-turbo--gpt-3.5-turbo run wordle
python scripts/cli.py run -g wordle -m gpt-3.5-turbo
```
......@@ -139,25 +145,25 @@ with the format described in ```logdoc.md```.
In order to generate the transcriptions of the dialogues, please run this command:
```
python3 scripts/cli.py transcribe all
python3 scripts/cli.py transcribe
```
Or put a single game name (taboo, referencegame, imagegame, wordle, privateshared)
```
python3 scripts/cli.py transcribe taboo
python3 scripts/cli.py transcribe -g taboo
```
Next, run this command to generate the scores of the dialogues:
```
python3 scripts/cli.py score all
python3 scripts/cli.py score
```
Or put a single game name (taboo, referencegame, imagegame, wordle, privateshared)
```
python3 scripts/cli.py score taboo
python3 scripts/cli.py score -g taboo
```
We provide an evaluation script at `evaluation/papereval.py`, which was used for the paper and produces a number of tables and visualizations for all games in the ```results/``` directory. To use this script, new models (with their name abbreviations), metrics (with their ranges) and games/models (with their ordering) must be added manually to the constants in ```evaluation/evalutils.py```. Run the following to replicate the results in the paper or to evaluate new results:
......
......@@ -52,7 +52,7 @@ source prepare_path.sh
Then run the cli script to run the `taboo` game on the `high_en` experiment using the `fsc-openchat-3.5-0106` model (self-play). You can replace the game and experiment names for your own use case.
```
python3 scripts/cli.py -m fsc-openchat-3.5-0106--fsc-openchat-3.5-0106 -e high_en run taboo
python3 scripts/cli.py run -g taboo -m fsc-openchat-3.5-0106 -e high_en
```
(The `-m` option tells the script which models to use; since taboo is a two-player game, the single model given here will play both roles.)
......@@ -70,7 +70,7 @@ If that is the case, output (transcripts of the games played) will have been wri
For example, you can use that script to get a more readable version of the game play jsons like so:
```
python3 scripts/cli.py transcribe taboo
python3 scripts/cli.py transcribe -g taboo
```
After running this, the `results` directory will now hold html and LaTeX views of the transcripts for each episode.
......@@ -78,7 +78,7 @@ After running this, the `results` directory will now hold html and LaTeX views o
Next run the following to generate scores:
```
python3 scripts/cli.py score taboo
python3 scripts/cli.py score -g taboo
```
# Evaluation
......
......@@ -26,37 +26,37 @@ game_runs=(
"wordle_withclue luminous-supreme"
"wordle_withclue gpt-4"
# Multi-player taboo
"taboo text-davinci-003--text-davinci-003"
"taboo gpt-3.5-turbo--gpt-3.5-turbo"
"taboo claude-v1.3--claude-v1.3"
"taboo luminous-supreme--luminous-supreme"
"taboo gpt-4--gpt-4"
"taboo gpt-4--gpt-3.5-turbo"
"taboo gpt-3.5-turbo--gpt-4"
"taboo text-davinci-003"
"taboo gpt-3.5-turbo"
"taboo claude-v1.3"
"taboo luminous-supreme"
"taboo gpt-4"
"taboo gpt-4 gpt-3.5-turbo"
"taboo gpt-3.5-turbo gpt-4"
# Multi-player referencegame
"referencegame text-davinci-003--text-davinci-003"
"referencegame gpt-3.5-turbo--gpt-3.5-turbo"
"referencegame claude-v1.3--claude-v1.3"
"referencegame luminous-supreme--luminous-supreme"
"referencegame gpt-4--gpt-4"
"referencegame gpt-4--gpt-3.5-turbo"
"referencegame gpt-3.5-turbo--gpt-4"
"referencegame text-davinci-003"
"referencegame gpt-3.5-turbo-"
"referencegame claude-v1.3"
"referencegame luminous-supreme"
"referencegame gpt-4"
"referencegame gpt-4 gpt-3.5-turbo"
"referencegame gpt-3.5-turbo gpt-4"
# Multi-player imagegame
"imagegame text-davinci-003--text-davinci-003"
"imagegame gpt-3.5-turbo--gpt-3.5-turbo"
"imagegame claude-v1.3--claude-v1.3"
"imagegame luminous-supreme--luminous-supreme"
"imagegame gpt-4--gpt-4"
"imagegame gpt-4--gpt-3.5-turbo"
"imagegame gpt-3.5-turbo--gpt-4"
"imagegame text-davinci-003"
"imagegame gpt-3.5-turbo"
"imagegame claude-v1.3"
"imagegame luminous-supreme"
"imagegame gpt-4"
"imagegame gpt-4 gpt-3.5-turbo"
"imagegame gpt-3.5-turbo gpt-4"
# Multi-player wordle_withcritic
"wordle_withcritic text-davinci-003--text-davinci-003"
"wordle_withcritic gpt-3.5-turbo--gpt-3.5-turbo"
"wordle_withcritic claude-v1.3--claude-v1.3"
"wordle_withcritic luminous-supreme--luminous-supreme"
"wordle_withcritic gpt-4--gpt-4"
"wordle_withcritic gpt-4--gpt-3.5-turbo"
"wordle_withcritic gpt-3.5-turbo--gpt-4"
"wordle_withcritic text-davinci-003"
"wordle_withcritic gpt-3.5-turbo"
"wordle_withcritic claude-v1.3"
"wordle_withcritic luminous-supreme"
"wordle_withcritic gpt-4"
"wordle_withcritic gpt-4 gpt-3.5-turbo"
"wordle_withcritic gpt-3.5-turbo gpt-4"
)
total_runs=${#game_runs[@]}
echo "Number of benchmark runs: $total_runs"
......
......@@ -27,25 +27,25 @@ game_runs=(
"wordle_withclue falcon-40b-instruct"
"wordle_withclue oasst-sft-4-pythia-12b-epoch-3.5"
# Multi-player taboo
"taboo koala-13B-HF--koala-13B-HF"
"taboo Wizard-Vicuna-13B-Uncensored-HF--Wizard-Vicuna-13B-Uncensored-HF"
"taboo falcon-40b-instruct--falcon-40b-instruct"
"taboo oasst-sft-4-pythia-12b-epoch-3.5--oasst-sft-4-pythia-12b-epoch-3.5"
"taboo koala-13B-HF"
"taboo Wizard-Vicuna-13B-Uncensored-HF"
"taboo falcon-40b-instruct"
"taboo oasst-sft-4-pythia-12b-epoch-3.5"
# Multi-player referencegame
"referencegame koala-13B-HF--koala-13B-HF"
"referencegame Wizard-Vicuna-13B-Uncensored-HF--Wizard-Vicuna-13B-Uncensored-HF"
"referencegame falcon-40b-instruct--falcon-40b-instruct"
"referencegame oasst-sft-4-pythia-12b-epoch-3.5--oasst-sft-4-pythia-12b-epoch-3.5"
"referencegame koala-13B-HF"
"referencegame Wizard-Vicuna-13B-Uncensored-HF"
"referencegame falcon-40b-instruct"
"referencegame oasst-sft-4-pythia-12b-epoch-3.5"
# Multi-player imagegame
"imagegame koala-13B-HF--koala-13B-HF"
"imagegame Wizard-Vicuna-13B-Uncensored-HF--Wizard-Vicuna-13B-Uncensored-HF"
"imagegame falcon-40b-instruct--falcon-40b-instruct"
"imagegame oasst-sft-4-pythia-12b-epoch-3.5--oasst-sft-4-pythia-12b-epoch-3.5"
"imagegame koala-13B-HF"
"imagegame Wizard-Vicuna-13B-Uncensored-HF"
"imagegame falcon-40b-instruct"
"imagegame oasst-sft-4-pythia-12b-epoch-3.5"
# Multi-player wordle_withcritic
"wordle_withcritic koala-13B-HF--koala-13B-HF"
"wordle_withcritic Wizard-Vicuna-13B-Uncensored-HF--Wizard-Vicuna-13B-Uncensored-HF"
"wordle_withcritic falcon-40b-instruct--falcon-40b-instruct"
"wordle_withcritic oasst-sft-4-pythia-12b-epoch-3.5--oasst-sft-4-pythia-12b-epoch-3.5"
"wordle_withcritic koala-13B-HF"
"wordle_withcritic Wizard-Vicuna-13B-Uncensored-HF"
"wordle_withcritic falcon-40b-instruct"
"wordle_withcritic oasst-sft-4-pythia-12b-epoch-3.5"
)
total_runs=${#game_runs[@]}
echo "Number of benchmark runs: $total_runs"
......
......@@ -26,13 +26,13 @@ game_runs=(
# Single-player: wordle_withclue
"wordle_withclue llama-2-13b-chat-hf"
# Multi-player taboo
"taboo llama-2-13b-chat-hf--llama-2-13b-chat-hf"
"taboo llama-2-13b-chat-hf"
# Multi-player referencegame
"referencegame llama-2-13b-chat-hf--llama-2-13b-chat-hf"
"referencegame llama-2-13b-chat-hf"
# Multi-player imagegame
"imagegame llama-2-13b-chat-hf--llama-2-13b-chat-hf"
"imagegame llama-2-13b-chat-hf"
# Multi-player wordle_withcritic
"wordle_withcritic llama-2-13b-chat-hf--llama-2-13b-chat-hf"
"wordle_withcritic llama-2-13b-chat-hf"
)
total_runs=${#game_runs[@]}
echo "Number of benchmark runs: $total_runs"
......
#!/bin/bash
if [ ! $# -eq 2 ]; then
echo "Please provide exactly two arguments: run.sh <game_name> <model_pair>"
# Script to run the benchmark for a single model type (self-play) or single-player games
# For example: run.sh taboo gpt-3
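# Or with two different models, e.g.: run.sh taboo gpt-4 gpt-3.5-turbo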
if [ $# -lt 2 ]; then
echo "Please provide at least two arguments: run.sh <game_name> <player_0> [<player_1>]"
exit 1
fi
arg_game="$1"
arg_model="$2"
# Load and prepare path
# source venv/bin/activate
source prepare_path.sh
arg_game="$1"
arg_model0="$2"
# Set temperature to 0.0
{ time python3 scripts/cli.py -m "$arg_model" -t 0.0 run "$arg_game"; } 2>&1 | tee runtime."$arg_game"."$arg_model".log
arg_temp=0.0
if [ $# -eq 2 ]; then
{ time python3 scripts/cli.py run -g "$arg_game" -m "${arg_model0}" -t $arg_temp; } 2>&1 | tee runtime."${arg_game}"."${arg_model0}"--"${arg_model0}".log
fi
if [ $# -eq 3 ]; then
arg_model1="$3"
{ time python3 scripts/cli.py run -g "$arg_game" -m "${arg_model0}" "${arg_model1}" -t $arg_temp; } 2>&1 | tee runtime."${arg_game}"."${arg_model0}"--"${arg_model1}".log
fi
......@@ -8,23 +8,26 @@ from clemgame import benchmark
To list available games:
$> python3 scripts/cli.py ls
To run all games:
$> python3 scripts/cli.py [-m "mock"] run all
To run a specific game with a single player:
$> python3 scripts/cli.py run -g privateshared -m mock
To run a specific game:
$> python3 scripts/cli.py [-m "mock"] run privateshared
To run a specific game with two players:
$> python3 scripts/cli.py run -g taboo -m mock mock
If the game supports model expansion (using the single specified model for all players):
$> python3 scripts/cli.py run -g taboo -m mock
To score all games:
$> python3 scripts/cli.py score all
$> python3 scripts/cli.py score
To score a specific game:
$> python3 scripts/cli.py score privateshared
$> python3 scripts/cli.py score -g privateshared
To transcribe all games:
$> python3 scripts/cli.py transcribe all
$> python3 scripts/cli.py transcribe
To transcribe a specific game:
$> python3 scripts/cli.py transcribe privateshared
$> python3 scripts/cli.py transcribe -g privateshared
"""
......@@ -32,43 +35,54 @@ def main(args):
if args.command_name == "ls":
benchmark.list_games()
if args.command_name == "run":
benchmark.run(args.game_name,
benchmark.run(args.game,
temperature=args.temperature,
model_name=args.model_name,
models=args.models,
experiment_name=args.experiment_name)
if args.command_name == "score":
benchmark.score(args.game_name,
experiment_name=args.experiment_name)
benchmark.score(args.game, experiment_name=args.experiment_name)
if args.command_name == "transcribe":
benchmark.transcripts(args.game_name,
experiment_name=args.experiment_name)
benchmark.transcripts(args.game, experiment_name=args.experiment_name)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model_name", type=str,
help="Assumes model names supported by the implemented backends."
" Use 'mock--mock' to avoid using any backend."
" Note: '--' to configure two dialogue partners e.g. gpt-3.5-turbo--gpt-3.5-turbo."
" For single-player games like private-shared only provide one model e.g. gpt-3.5-turbo."
" When a dialog pair is given for a single-player game, then an error is thrown."
" When this option is not given, then the dialogue partners configured in the experiment"
" are used."
" Default: None.")
parser.add_argument("-e", "--experiment_name", type=str,
help="Optional argument to only run a specific experiment")
parser.add_argument("-t", "--temperature", type=float, default=0.0,
help="Argument to specify sampling temperature used for the whole benchmark run. Default: 0.0.")
sub_parsers = parser.add_subparsers(dest="command_name")
sub_parsers.add_parser("ls")
run_parser = sub_parsers.add_parser("run")
run_parser.add_argument("game_name", help="A specific game name (see ls) or 'all'."
" Important: 'all' only allows self-play for now. For this mode pass"
" only single model names e.g. model-a and then this will automatically"
" be expanded to model-a--model-a for multi-player games.")
run_parser = sub_parsers.add_parser("score")
run_parser.add_argument("game_name", help="A specific game name (see ls) or 'all'")
run_parser = sub_parsers.add_parser("transcribe")
run_parser.add_argument("game_name", help="A specific game name (see ls) or 'all'")
run_parser = sub_parsers.add_parser("run", formatter_class=argparse.RawTextHelpFormatter)
run_parser.add_argument("-m", "--models", type=str, nargs="*",
help="""Assumes model names supported by the implemented backends.
To run a specific game with a single player:
$> python3 scripts/cli.py run -g privateshared -m mock
To run a specific game with two players:
$> python3 scripts/cli.py run -g taboo -m mock mock
If the game supports model expansion (using the single specified model for all players):
$> python3 scripts/cli.py run -g taboo -m mock
When this option is not given, then the dialogue partners configured in the experiment are used.
Default: None.""")
run_parser.add_argument("-t", "--temperature", type=float, default=0.0,
help="Argument to specify sampling temperature for the models. Default: 0.0.")
run_parser.add_argument("-e", "--experiment_name", type=str,
help="Optional argument to only run a specific experiment")
run_parser.add_argument("-g", "--game", type=str,
required=True, help="A specific game name (see ls).")
score_parser = sub_parsers.add_parser("score")
score_parser.add_argument("-e", "--experiment_name", type=str,
help="Optional argument to only run a specific experiment")
score_parser.add_argument("-g", "--game", type=str,
help="A specific game name (see ls).", default="all")
transcribe_parser = sub_parsers.add_parser("transcribe")
transcribe_parser.add_argument("-e", "--experiment_name", type=str,
help="Optional argument to only run a specific experiment")
transcribe_parser.add_argument("-g", "--game", type=str,
help="A specific game name (see ls).", default="all")
args = parser.parse_args()
main(args)