diff --git a/clemgame/benchmark.py b/clemgame/benchmark.py index 95b98c78ee4d68f77f5a723739d41ed3463beafe..74a72908fc012706dc5328a10e69bc3173e24fc4 100644 --- a/clemgame/benchmark.py +++ b/clemgame/benchmark.py @@ -1,4 +1,6 @@ """ Main entry point """ +from typing import List + import clemgame from datetime import datetime @@ -20,40 +22,22 @@ def list_games(): stdout_logger.info(" Game: %s -> %s", game.name, game.get_description()) -def run(game_name: str, temperature: float, model_name: str = None, experiment_name: str = None): - logger.info("Running benchmark for: %s (model_name=%s)", game_name, - model_name if model_name is not None else "see experiment configs") +def run(game_name: str, temperature: float, models: List[str] = None, experiment_name: str = None): assert 0.0 <= temperature <= 1.0, "Temperature must be in [0.,1.]" if experiment_name: logger.info("Only running experiment: %s", experiment_name) - if game_name == "all" and model_name is not None: - if string_utils.is_pair_descriptor(model_name): - raise ValueError("'all' argument only allows self-play (single model arguments)." - " Please provide individual model names e.g. model-a which is" - " then automatically expanded to a dialogue pair for multi-player.") - games_list = load_benchmarks() - else: - games_list = [load_benchmark(game_name)] - total_games = len(games_list) - for idx, benchmark in enumerate(games_list): - try: - if experiment_name: - benchmark.filter_experiment.append(experiment_name) - stdout_logger.info(f"Run game {idx + 1} of {total_games}: {benchmark.name}") - time_start = datetime.now() - # checking for None here is important b.c. then experiment conf is used - if game_name == "all" and model_name is not None: # automatically handle self-play - if benchmark.is_single_player(): - dialog_pair = model_name - else: # multi-player - dialog_pair = string_utils.to_pair_descriptor([model_name, model_name]) - else: # for particular games calls take the given argument directly (the user should know) - dialog_pair = model_name - benchmark.run(dialog_pair=dialog_pair, temperature=temperature) - time_end = datetime.now() - logger.info(f"Run {benchmark.name} took {str(time_end - time_start)}") - except Exception as e: - logger.error(e, exc_info=True) + try: + benchmark = load_benchmark(game_name) + logger.info("Running benchmark for: %s (models=%s)", game_name, + models if models is not None else "see experiment configs") + if experiment_name: + benchmark.filter_experiment.append(experiment_name) + time_start = datetime.now() + benchmark.run(player_backends=models, temperature=temperature) + time_end = datetime.now() + logger.info(f"Run {benchmark.name} took {str(time_end - time_start)}") + except Exception as e: + logger.error(e, exc_info=True) def score(game_name: str, experiment_name: str = None): diff --git a/clemgame/clemgame.py b/clemgame/clemgame.py index e53436fb8cd32e822a230f4e1b4469739e3b66e6..10b11d72071032024461e694ca3ffb662109130e 100644 --- a/clemgame/clemgame.py +++ b/clemgame/clemgame.py @@ -650,7 +650,7 @@ class GameBenchmark(GameResourceLocator): stdout_logger.error( f"{self.name}: '{error_count}' exceptions occurred: See clembench.log for details.") - def run(self, dialog_pair: str, temperature: float): + def run(self, player_backends: List[str], temperature: float): """ Runs game-play on all game instances for a game. 
There must be an instances.json with the following structure: @@ -693,11 +693,8 @@ class GameBenchmark(GameResourceLocator): # Determine dialogue partners: How often to run the experiment with different partners dialogue_partners: List[List[str]] = [] - if dialog_pair: # favor runtime argument over experiment config - if string_utils.is_pair_descriptor(dialog_pair): - dialogue_partners = [string_utils.to_model_pair(dialog_pair)] - else: - dialogue_partners = [[dialog_pair]] + if player_backends: # favor runtime argument over experiment config + dialogue_partners = [player_backends] elif "dialogue_partners" in experiment: dialogue_partners = experiment["dialogue_partners"] self.logger.info(f"{self.name}: Detected 'dialogue_partners' in experiment config. " @@ -705,27 +702,29 @@ class GameBenchmark(GameResourceLocator): if not dialogue_partners: message = (f"{self.name}: Neither 'dialogue_partners' set in experiment instance" - f" nor 'model_name' given as run arg") + f" nor 'models' given as run arg") stdout_logger.error(message) raise ValueError(message) for dialogue_pair in dialogue_partners: - if len(dialogue_pair) == 1 and self.is_single_player(): - model_name = dialogue_pair[0] - dialogue_pair_desc = f"{model_name}-t{temperature}" - stdout_logger.info(f"With single player: {dialogue_pair_desc}") + if self.is_single_player(): + if len(dialogue_pair) > 1: + message = f"Too many players for single-player game '{self.name}': '{len(dialogue_pair)}'" + stdout_logger.error(message) + raise ValueError(message) + model_desc_0 = f"{dialogue_pair[0]}-t{temperature}" # still we store to model--model dir (virtual self-play) - dialogue_pair_desc = f"{dialogue_pair_desc}--{dialogue_pair_desc}" - elif len(dialogue_pair) == 2 and not self.is_single_player(): - dialogue_pair_desc = string_utils.to_pair_descriptor([f"{model_name}-t{temperature}" - for model_name in dialogue_pair]) - stdout_logger.info(f"With dialog partners: {dialogue_pair_desc}") - else: - message = (f"Invalid model pairing {dialogue_pair}" - f" for a {'single' if self.is_single_player() else 'multi'}-player game." - f" For single-player expected only a single model, otherwise a pair.") - stdout_logger.error(message) - raise ValueError(message) + dialogue_pair_desc = f"{model_desc_0}--{model_desc_0}" + else: # 2-players + if len(dialogue_pair) > 2: + message = f"Too many players for two-player game '{self.name}': '{len(dialogue_pair)}'" + stdout_logger.error(message) + raise ValueError(message) + if len(dialogue_pair) == 1: + dialogue_pair.append(dialogue_pair[0]) # model expansion + model_desc_0 = f"{dialogue_pair[0]}-t{temperature}" + model_desc_1 = f"{dialogue_pair[1]}-t{temperature}" + dialogue_pair_desc = f"{model_desc_0}--{model_desc_1}" episode_counter = 0 self.logger.info("Activity: %s Experiment: %s Partners: %s Episode: %d", diff --git a/docs/howto_add_games.md b/docs/howto_add_games.md index 9a32139fe8d077f971ce12fb1ac01695ffcc9643..6b89314daedae60d38c84f19ffd0fb467173bfa7 100644 --- a/docs/howto_add_games.md +++ b/docs/howto_add_games.md @@ -12,11 +12,17 @@ If you're completely new to this, it might make sense to look at two Jupyter not The benchmark is run for a particular game -- for example the taboo game -- using the follow command: ``` -python3 scripts/cli.py -m gpt-3.5-turbo--gpt-3.5-turbo run taboo +python3 scripts/cli.py run -g taboo -m gpt-3.5-turbo-1106 ``` -From the call we already see that taboo is a two-player game because we need to provide a descriptor for two models.
-These models are supposed to play certain roles in the game, here a clue giver and a guesser. +_Note: when only a single model is given for a 2-player game, clem will use this model for both players!_ + +As taboo is a two-player game (a clue giver and a guesser), we could also let two different +models play the game, which would look like: + +``` +python3 scripts/cli.py run -g taboo -m gpt-3.5-turbo-1106 gpt-4-0613 +``` ### GameBenchmark class @@ -51,7 +57,7 @@ of the necessary plumbing and executes the main logic for a benchmark run (calli Aside: The return value of `get_description` is shown for the `python3 scripts/cli.py ls` command. Then the benchmark code checks if your game is single or multiplayer game (the default is multi-player), -so that the `-m gpt-3.5-turbo--gpt-3.5-turbo` option is properly handled. +so that the `-m gpt-3.5-turbo-1106` option is properly handled. Then the `run(dialog_pair,temperature)` method is called which is already implemented by `GameBenchmark`. This is when the `GameMaster` becomes relevant (which should be returned by your `create_game_master()` factory method). @@ -388,36 +394,37 @@ Add to the module a `master.py` that implements the `GameMaster`. ### Running experiments with your game ``` -python3 scripts/cli.py -m gpt-3.5-turbo [-e greet_en] run hellogame +python3 scripts/cli.py run -g hellogame -m gpt-3.5-turbo-1106 [-e greet_en] ``` Note: With -e you can specify specific experiments to run. -This will create a records folder in your game directory as the following: +This will create a results folder in the project root as follows: ``` -records -└── gpt-3.5-turbo - └── greet_en - ├── episode_0 - │ ├── instance_0.json - │ ├── interaction.json - │ └── transcript.html - ├── episode_1 - │ ├── instance_1.json - │ ├── interaction.json - │ └── transcript.html - │ ... - └── experiment_greet_en.json +results +└── gpt-3.5-turbo-1106-t0.0--gpt-3.5-turbo-1106-t0.0 + └── hellogame + └── 0_greet_en + ├── episode_0 + │ ├── instance.json + │ ├── interaction.json + │ └── transcript.html + ├── episode_1 + │ ├── instance.json + │ ├── interaction.json + │ └── transcript.html + │ ... + └── experiment_greet_en.json ``` -The top level is `records` followed by directories that mention the involved model (pairings). +The top level is `results` followed by directories that mention the involved model (pairings). The model (pairing) sub-folders will contain a directory structure for each experiment and the experiments episodes (game plays). The episodes are defined by the game instances (from the `instances.json`) and -contain the instance parameters `instance_n.json`, an `interaction.json` and a nice human-viewable `transcript.html`. +contain the instance parameters `instance.json`, an `interaction.json` and a nice human-viewable `transcript.html`. The experiment folder also contains a `experiment_name.json` that contains the run parameters.
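The name of the model (pairing) directory, e.g. `gpt-3.5-turbo-1106-t0.0--gpt-3.5-turbo-1106-t0.0`, is derived from the `-m/--models` argument and the temperature. The following Python snippet is a minimal, illustrative sketch of that naming and of the single-model expansion mentioned in the note above; the helper `results_subdir` is made up for illustration and is not part of the codebase.

```python
from typing import List

def results_subdir(models: List[str], temperature: float) -> str:
    # Illustrative only: mirrors how the patched GameBenchmark.run derives the
    # results directory name from the -m/--models argument and the temperature.
    pair = list(models)
    if len(pair) == 1:  # single model given for a 2-player game -> virtual self-play
        pair.append(pair[0])
    return "--".join(f"{model}-t{temperature}" for model in pair)

# ["gpt-3.5-turbo-1106"] with temperature 0.0 yields
# "gpt-3.5-turbo-1106-t0.0--gpt-3.5-turbo-1106-t0.0"
print(results_subdir(["gpt-3.5-turbo-1106"], 0.0))
```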
diff --git a/docs/howto_benchmark_workflow.md b/docs/howto_benchmark_workflow.md index 749e92a60a49ea6a927367247580011c7936209b..c5ec8b98b89a408c7db7e5e81f1a118bf50692b2 100644 --- a/docs/howto_benchmark_workflow.md +++ b/docs/howto_benchmark_workflow.md @@ -7,16 +7,10 @@ Detailed documentation about setting up the virtual environment, installing libr The benchmark is run for a particular game with a particular model -- for example, the taboo game on GPT-3.5-turbo -- using the following command: ``` -python3 scripts/cli.py -m gpt-3.5-turbo--gpt-3.5-turbo run taboo +python3 scripts/cli.py run -g taboo -m gpt-3.5-turbo ``` -Or run the following command to run all existing games on the chosen model (GPT-3.5-turbo): - -``` -python3 scripts/cli.py -m gpt-3.5-turbo--gpt-3.5-turbo run all -``` - -Alternatively, the benchmark run can be scripted as follows to run multiple games and model combinations using the following bash script (here, only GPT-3.5-turbo and GPT-4 are chosen as references): +Alternatively, the benchmark run can be scripted to run multiple games with various models (in self-play mode) using the following bash script (here, only GPT-3.5-turbo and GPT-4 are chosen as references): ``` #!/bin/bash @@ -30,24 +24,30 @@ game_runs=( # Single-player: privateshared "privateshared gpt-3.5-turbo" "privateshared gpt-4" + # Single-player: wordle "wordle gpt-3.5-turbo" "wordle gpt-4" + # Single-player: wordle_withclue "wordle_withclue gpt-3.5-turbo" "wordle_withclue gpt-4" + # Multi-player taboo - "taboo gpt-3.5-turbo--gpt-3.5-turbo" - "taboo gpt-4--gpt-4" + "taboo gpt-3.5-turbo" + "taboo gpt-4" + # Multi-player referencegame - "referencegame gpt-3.5-turbo--gpt-3.5-turbo" - "referencegame gpt-4--gpt-4" + "referencegame gpt-3.5-turbo" + "referencegame gpt-4" + # Multi-player imagegame - "imagegame gpt-3.5-turbo--gpt-3.5-turbo" - "imagegame gpt-4--gpt-4" + "imagegame gpt-3.5-turbo" + "imagegame gpt-4" + # Multi-player wordle_withcritic - "wordle_withcritic gpt-3.5-turbo--gpt-3.5-turbo" - "wordle_withcritic gpt-4--gpt-4" + "wordle_withcritic gpt-3.5-turbo" + "wordle_withcritic gpt-4" ) total_runs=${#game_runs[@]} echo "Number of benchmark runs: $total_runs" @@ -69,13 +69,25 @@ Once the benchmark runs are finished, the `results` folder will include all run Run the following command to generate transcriptions of the dialogues. The script generates LaTeX and HTML files of the dialogues under each episode of a particular experiment. ``` -python3 scripts/cli.py transcribe all +python3 scripts/cli.py transcribe +``` + +The default is to transcribe `all` games, but we could also do this just for a single game: + +``` +python3 scripts/cli.py transcribe -g taboo ``` Next, run the scoring command that calculates turn & episode-specific metrics defined for each game. This script generates `scores.json` file stored under the same folder as transcripts and other files under a specific episode. ``` -python3 scripts/cli.py score all +python3 scripts/cli.py score +``` + +The default is to score `all` games,
but we could also do this just for a single game: + +``` +python3 scripts/cli.py score -g taboo ``` ### Evaluate the Benchmark Run & Update the Leaderboard diff --git a/docs/howto_run_benchmark.md b/docs/howto_run_benchmark.md index eee61c4e39bbf21e7ddbf320eaa09abc5bf431be..364e3417dfb51c2cbfc168573a4b1782966cf574 100644 --- a/docs/howto_run_benchmark.md +++ b/docs/howto_run_benchmark.md @@ -62,10 +62,15 @@ source prepare_path.sh Then run the cli script ``` -python3 scripts/cli.py -m gpt-3.5-turbo--gpt-3.5-turbo run taboo +python3 scripts/cli.py run -g taboo -m gpt-3.5-turbo ``` -(The `-m` option tells the script which model to use; since taboo is a two player game, we need both partners to be specified here.) +The `-m` option tells the script which model to use. Since taboo is a two-player game, we could also +let two different models play the game, which would look like: + +``` +python3 scripts/cli.py run -g taboo -m gpt-3.5-turbo-1106 gpt-4-0613 +``` This should give you an output on the terminal that contains something like the following: @@ -73,20 +78,21 @@ This should give you an output on the terminal that contains something like the Playing games: 100%|██████████████████████████████████| 20/20 [00:48<00:00, 2.41s/it] ``` -If that is the case, output (transcripts of the games played) will have been written to `results/taboo` (in the main directory of the code). +If that is the case, output (transcripts of the games played) will have been written to +`results/gpt-3.5-turbo-t0.0--gpt-3.5-turbo-t0.0/taboo` (in the main directory of the code). Unfortunately, at the moment the code often fails silently, for example if model names are wrong, so make sure that you see the confirmation that the game actually has been played. Have a look at the file `clembench.log` if you suspect that something might be wrong. You can get more information about what you can do with the `cli` script via: ``` -python3 scripts/cli.py --help +python3 scripts/cli.py run --help ``` For example, you can use that script to get a more readable version of the game play jsons like so: ``` -python3 scripts/cli.py transcribe taboo +python3 scripts/cli.py transcribe -g taboo ``` After running this, the `results` directory will now hold html and LaTeX views of the transcripts. @@ -95,23 +101,23 @@ After running this, the `results` directory will now hold html and LaTeX views o To run other game masters individually use the following scripts. Note some games (privateshared) are single player and some games can be multiplayer (taboo, referencegame, imagegame, wordle) ``` -python scripts/cli.py -m gpt-3.5-turbo run privateshared +python scripts/cli.py run -g privateshared -m gpt-3.5-turbo ``` ``` -python scripts/cli.py -m gpt-3.5-turbo--gpt-3.5-turbo run taboo +python scripts/cli.py run -g taboo -m gpt-3.5-turbo ``` ``` -python scripts/cli.py -m gpt-3.5-turbo--gpt-3.5-turbo run imagegame +python scripts/cli.py run -g imagegame -m gpt-3.5-turbo ``` ``` -python scripts/cli.py -m gpt-3.5-turbo--gpt-3.5-turbo run referencegame +python scripts/cli.py run -g referencegame -m gpt-3.5-turbo ``` ``` -python scripts/cli.py -m gpt-3.5-turbo--gpt-3.5-turbo run wordle +python scripts/cli.py run -g wordle -m gpt-3.5-turbo ``` @@ -139,25 +145,25 @@ with the format described in ```logdoc.md```.
In order to generate the transcriptions of the dialogues, please run this command: ``` -python3 scripts/cli.py transcribe all +python3 scripts/cli.py transcribe ``` Or put a single game name (taboo, referencegame, imagegame, wordle, privateshared) ``` -python3 scripts/cli.py transcribe taboo +python3 scripts/cli.py transcribe -g taboo ``` Next, run this command to generate the scores of the dialogues: ``` -python3 scripts/cli.py score all +python3 scripts/cli.py score ``` Or put a single game name (taboo, referencegame, imagegame, wordle, privateshared) ``` -python3 scripts/cli.py score taboo +python3 scripts/cli.py score -g taboo ``` We provide an evaluation script at `evaluation/papereval.py` that produces a number of tables and visualizations for all games in the ```results/``` directory, which was used for the paper. To use this script, new models (their name abbreviation), metrics (their range) and game/model (their order) must be added manually to the constants in ```evaluation/evalutils.py```. Run the following to replicate the results in the paper or if you have new results: diff --git a/docs/howto_run_games_locally.md b/docs/howto_run_games_locally.md index b03bc4b2d47667c4780961433c71a40677c02996..edee71b3c89f6b6ab63feb74beafd4c03faec488 100644 --- a/docs/howto_run_games_locally.md +++ b/docs/howto_run_games_locally.md @@ -52,7 +52,7 @@ source prepare_path.sh Then run the cli script to run the `taboo` game on the `high_en` experiment using the pairs of `fsc-openchat-3.5-0106` models. You can replace the game and experiment names for your own use case. ``` -python3 scripts/cli.py -m fsc-openchat-3.5-0106--fsc-openchat-3.5-0106 -e high_en run taboo +python3 scripts/cli.py run -g taboo -m fsc-openchat-3.5-0106 -e high_en ``` (The `-m` option tells the script which model to use; since taboo is a two player game, we need both partners to be specified here.) @@ -70,7 +70,7 @@ If that is the case, output (transcripts of the games played) will have been wri For example, you can use that script to get a more readable version of the game play jsons like so: ``` -python3 scripts/cli.py transcribe taboo +python3 scripts/cli.py transcribe -g taboo ``` After running this, the `results` directory will now hold html and LaTeX views of the transcripts for each episode. 
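The CLI commands above are thin wrappers around the `clemgame.benchmark` module, so the same run/transcribe/score steps can also be driven from Python. A short sketch, using only the entry points visible in this patch (the keyword arguments mirror how the updated `scripts/cli.py` calls them):

```python
from clemgame import benchmark

# Equivalent of: python3 scripts/cli.py run -g taboo -m gpt-3.5-turbo-1106 gpt-4-0613
benchmark.run("taboo", temperature=0.0, models=["gpt-3.5-turbo-1106", "gpt-4-0613"])

# Equivalent of: python3 scripts/cli.py transcribe -g taboo
benchmark.transcripts("taboo", experiment_name=None)

# Equivalent of: python3 scripts/cli.py score -g taboo
benchmark.score("taboo", experiment_name=None)
```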
@@ -78,7 +78,7 @@ After running this, the `results` directory will now hold html and LaTeX views o Next run the following to generate scores: ``` -python3 scripts/cli.py score taboo +python3 scripts/cli.py score -g taboo ``` # Evaluation diff --git a/pipeline_clembench.sh b/pipeline_clembench.sh index 5d9e1a86c9b9d45c6a9bac1daccd63e299abc151..42639f62b180862a5796e87f7ef035d01631ab46 100755 --- a/pipeline_clembench.sh +++ b/pipeline_clembench.sh @@ -26,37 +26,37 @@ game_runs=( "wordle_withclue luminous-supreme" "wordle_withclue gpt-4" # Multi-player taboo - "taboo text-davinci-003--text-davinci-003" - "taboo gpt-3.5-turbo--gpt-3.5-turbo" - "taboo claude-v1.3--claude-v1.3" - "taboo luminous-supreme--luminous-supreme" - "taboo gpt-4--gpt-4" - "taboo gpt-4--gpt-3.5-turbo" - "taboo gpt-3.5-turbo--gpt-4" + "taboo text-davinci-003" + "taboo gpt-3.5-turbo" + "taboo claude-v1.3" + "taboo luminous-supreme" + "taboo gpt-4" + "taboo gpt-4 gpt-3.5-turbo" + "taboo gpt-3.5-turbo gpt-4" # Multi-player referencegame - "referencegame text-davinci-003--text-davinci-003" - "referencegame gpt-3.5-turbo--gpt-3.5-turbo" - "referencegame claude-v1.3--claude-v1.3" - "referencegame luminous-supreme--luminous-supreme" - "referencegame gpt-4--gpt-4" - "referencegame gpt-4--gpt-3.5-turbo" - "referencegame gpt-3.5-turbo--gpt-4" + "referencegame text-davinci-003" + "referencegame gpt-3.5-turbo" + "referencegame claude-v1.3" + "referencegame luminous-supreme" + "referencegame gpt-4" + "referencegame gpt-4 gpt-3.5-turbo" + "referencegame gpt-3.5-turbo gpt-4" # Multi-player imagegame - "imagegame text-davinci-003--text-davinci-003" - "imagegame gpt-3.5-turbo--gpt-3.5-turbo" - "imagegame claude-v1.3--claude-v1.3" - "imagegame luminous-supreme--luminous-supreme" - "imagegame gpt-4--gpt-4" - "imagegame gpt-4--gpt-3.5-turbo" - "imagegame gpt-3.5-turbo--gpt-4" + "imagegame text-davinci-003" + "imagegame gpt-3.5-turbo" + "imagegame claude-v1.3" + "imagegame luminous-supreme" + "imagegame gpt-4" + "imagegame gpt-4 gpt-3.5-turbo" + "imagegame gpt-3.5-turbo gpt-4" # Multi-player wordle_withcritic - "wordle_withcritic text-davinci-003--text-davinci-003" - "wordle_withcritic gpt-3.5-turbo--gpt-3.5-turbo" - "wordle_withcritic claude-v1.3--claude-v1.3" - "wordle_withcritic luminous-supreme--luminous-supreme" - "wordle_withcritic gpt-4--gpt-4" - "wordle_withcritic gpt-4--gpt-3.5-turbo" - "wordle_withcritic gpt-3.5-turbo--gpt-4" + "wordle_withcritic text-davinci-003" + "wordle_withcritic gpt-3.5-turbo" + "wordle_withcritic claude-v1.3" + "wordle_withcritic luminous-supreme" + "wordle_withcritic gpt-4" + "wordle_withcritic gpt-4 gpt-3.5-turbo" + "wordle_withcritic gpt-3.5-turbo gpt-4" ) total_runs=${#game_runs[@]} echo "Number of benchmark runs: $total_runs" diff --git a/pipeline_huggingfaces.sh b/pipeline_huggingfaces.sh index 9179cffab8b212f5247acf1ffa54d97c6dce472d..76cedc3c315628586b8212e4f6735e6638c7226b 100755 --- a/pipeline_huggingfaces.sh +++ b/pipeline_huggingfaces.sh @@ -27,25 +27,25 @@ game_runs=( "wordle_withclue falcon-40b-instruct" "wordle_withclue oasst-sft-4-pythia-12b-epoch-3.5" # Multi-player taboo - "taboo koala-13B-HF--koala-13B-HF" - "taboo Wizard-Vicuna-13B-Uncensored-HF--Wizard-Vicuna-13B-Uncensored-HF" - "taboo falcon-40b-instruct--falcon-40b-instruct" - "taboo oasst-sft-4-pythia-12b-epoch-3.5--oasst-sft-4-pythia-12b-epoch-3.5" + "taboo koala-13B-HF" + "taboo Wizard-Vicuna-13B-Uncensored-HF" + "taboo falcon-40b-instruct" + "taboo oasst-sft-4-pythia-12b-epoch-3.5" # Multi-player referencegame - "referencegame 
koala-13B-HF--koala-13B-HF" - "referencegame Wizard-Vicuna-13B-Uncensored-HF--Wizard-Vicuna-13B-Uncensored-HF" - "referencegame falcon-40b-instruct--falcon-40b-instruct" - "referencegame oasst-sft-4-pythia-12b-epoch-3.5--oasst-sft-4-pythia-12b-epoch-3.5" + "referencegame koala-13B-HF" + "referencegame Wizard-Vicuna-13B-Uncensored-HF" + "referencegame falcon-40b-instruct" + "referencegame oasst-sft-4-pythia-12b-epoch-3.5" # Multi-player imagegame - "imagegame koala-13B-HF--koala-13B-HF" - "imagegame Wizard-Vicuna-13B-Uncensored-HF--Wizard-Vicuna-13B-Uncensored-HF" - "imagegame falcon-40b-instruct--falcon-40b-instruct" - "imagegame oasst-sft-4-pythia-12b-epoch-3.5--oasst-sft-4-pythia-12b-epoch-3.5" + "imagegame koala-13B-HF" + "imagegame Wizard-Vicuna-13B-Uncensored-HF" + "imagegame falcon-40b-instruct" + "imagegame oasst-sft-4-pythia-12b-epoch-3.5" # Multi-player wordle_withcritic - "wordle_withcritic koala-13B-HF--koala-13B-HF" - "wordle_withcritic Wizard-Vicuna-13B-Uncensored-HF--Wizard-Vicuna-13B-Uncensored-HF" - "wordle_withcritic falcon-40b-instruct--falcon-40b-instruct" - "wordle_withcritic oasst-sft-4-pythia-12b-epoch-3.5--oasst-sft-4-pythia-12b-epoch-3.5" + "wordle_withcritic koala-13B-HF" + "wordle_withcritic Wizard-Vicuna-13B-Uncensored-HF" + "wordle_withcritic falcon-40b-instruct" + "wordle_withcritic oasst-sft-4-pythia-12b-epoch-3.5" ) total_runs=${#game_runs[@]} echo "Number of benchmark runs: $total_runs" diff --git a/pipeline_llama2_hf.sh b/pipeline_llama2_hf.sh index 28b7f3fbe83a874c3d416a43f73b1b6e2eea295e..da8049640d0d470002f5dad841b7f0a6d234633c 100644 --- a/pipeline_llama2_hf.sh +++ b/pipeline_llama2_hf.sh @@ -26,13 +26,13 @@ game_runs=( # Single-player: wordle_withclue "wordle_withclue llama-2-13b-chat-hf" # Multi-player taboo - "taboo llama-2-13b-chat-hf--llama-2-13b-chat-hf" + "taboo llama-2-13b-chat-hf" # Multi-player referencegame - "referencegame llama-2-13b-chat-hf--llama-2-13b-chat-hf" + "referencegame llama-2-13b-chat-hf" # Multi-player imagegame - "imagegame llama-2-13b-chat-hf--llama-2-13b-chat-hf" + "imagegame llama-2-13b-chat-hf" # Multi-player wordle_withcritic - "wordle_withcritic llama-2-13b-chat-hf--llama-2-13b-chat-hf" + "wordle_withcritic llama-2-13b-chat-hf" ) total_runs=${#game_runs[@]} echo "Number of benchmark runs: $total_runs" diff --git a/run.sh b/run.sh index 2efc889d252e6a4b6c40a10728c5d21b47b63d4d..7f4b11af8d87d8e5500e4b27e6d4dfd9d79a920f 100755 --- a/run.sh +++ b/run.sh @@ -1,16 +1,28 @@ #!/bin/bash -if [ ! 
$# -eq 2 ]; then - echo "Please provide exactly two arguments: run.sh <game_name> <model_pair>" +# Script to run the benchmark for a single model type (self-play) or single-player games +# For example: run.sh taboo gpt-3 + +if [ $# -lt 2 ]; then + echo "Please provide at least two arguments: run.sh <game_name> <player_0> [<player_1>]" exit 1 fi -arg_game="$1" -arg_model="$2" - # Load and prepare path # source venv/bin/activate source prepare_path.sh +arg_game="$1" +arg_model0="$2" + # Set temperature to 0.0 -{ time python3 scripts/cli.py -m "$arg_model" -t 0.0 run "$arg_game"; } 2>&1 | tee runtime."$arg_game"."$arg_model".log +arg_temp=0.0 + +if [ $# -eq 2 ]; then + { time python3 scripts/cli.py run -g "$arg_game" -m "${arg_model0}" -t $arg_temp; } 2>&1 | tee runtime."${arg_game}"."${arg_model0}"--"${arg_model0}".log +fi + +if [ $# -eq 3 ]; then + arg_model1="$3" + { time python3 scripts/cli.py run -g "$arg_game" -m "${arg_model0}" "${arg_model1}" -t $arg_temp; } 2>&1 | tee runtime."${arg_game}"."${arg_model0}"--"${arg_model1}".log +fi diff --git a/scripts/cli.py b/scripts/cli.py index 299f214d738f6b369e4fb37354f0ae7c5e7d1346..406af5e1658b3f3e32919aa4b9f9d45818e6aa40 100644 --- a/scripts/cli.py +++ b/scripts/cli.py @@ -8,23 +8,26 @@ from clemgame import benchmark To list available games: $> python3 scripts/cli.py ls - To run all games: - $> python3 scripts/cli.py [-m "mock"] run all + To run a specific game with a single player: + $> python3 scripts/cli.py run -g privateshared -m mock - To run a specific game: - $> python3 scripts/cli.py [-m "mock"] run privateshared + To run a specific game with two players: + $> python3 scripts/cli.py run -g taboo -m mock mock + + If the game supports model expansion (using the single specified model for all players): + $> python3 scripts/cli.py run -g taboo -m mock To score all games: - $> python3 scripts/cli.py score all + $> python3 scripts/cli.py score To score a specific game: - $> python3 scripts/cli.py score privateshared + $> python3 scripts/cli.py score -g privateshared To score all games: - $> python3 scripts/cli.py transcribe all + $> python3 scripts/cli.py transcribe To score a specific game: - $> python3 scripts/cli.py transcribe privateshared + $> python3 scripts/cli.py transcribe -g privateshared """ @@ -32,43 +35,54 @@ def main(args): if args.command_name == "ls": benchmark.list_games() if args.command_name == "run": - benchmark.run(args.game_name, + benchmark.run(args.game, temperature=args.temperature, - model_name=args.model_name, + models=args.models, experiment_name=args.experiment_name) if args.command_name == "score": - benchmark.score(args.game_name, - experiment_name=args.experiment_name) + benchmark.score(args.game, experiment_name=args.experiment_name) if args.command_name == "transcribe": - benchmark.transcripts(args.game_name, - experiment_name=args.experiment_name) + benchmark.transcripts(args.game, experiment_name=args.experiment_name) if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-m", "--model_name", type=str, - help="Assumes model names supported by the implemented backends." - " Use 'mock--mock' to avoid using any backend." - " Note: '--' to configure two dialogue partners e.g. gpt-3.5-turbo--gpt-3.5-turbo." - " For single-player games like private-shared only provide one model e.g. gpt-3.5-turbo." - " When a dialog pair is given for a single-player game, then an error is thrown."
- " When this option is not given, then the dialogue partners configured in the experiment" - " are used." - " Default: None.") - parser.add_argument("-e", "--experiment_name", type=str, - help="Optional argument to only run a specific experiment") - parser.add_argument("-t", "--temperature", type=float, default=0.0, - help="Argument to specify sampling temperature used for the whole benchmark run. Default: 0.0.") sub_parsers = parser.add_subparsers(dest="command_name") sub_parsers.add_parser("ls") - run_parser = sub_parsers.add_parser("run") - run_parser.add_argument("game_name", help="A specific game name (see ls) or 'all'." - " Important: 'all' only allows self-play for now. For this mode pass" - " only single model names e.g. model-a and then this will automatically" - " be expanded to model-a--model-a for multi-player games.") - run_parser = sub_parsers.add_parser("score") - run_parser.add_argument("game_name", help="A specific game name (see ls) or 'all'") - run_parser = sub_parsers.add_parser("transcribe") - run_parser.add_argument("game_name", help="A specific game name (see ls) or 'all'") + + run_parser = sub_parsers.add_parser("run", formatter_class=argparse.RawTextHelpFormatter) + run_parser.add_argument("-m", "--models", type=str, nargs="*", + help="""Assumes model names supported by the implemented backends. + + To run a specific game with a single player: + $> python3 scripts/cli.py run -g privateshared -m mock + + To run a specific game with two players: + $> python3 scripts/cli.py run -g taboo -m mock mock + + If the game supports model expansion (using the single specified model for all players): + $> python3 scripts/cli.py run -g taboo -m mock + + When this option is not given, then the dialogue partners configured in the experiment are used. + Default: None.""") + run_parser.add_argument("-t", "--temperature", type=float, default=0.0, + help="Argument to specify sampling temperature for the models. Default: 0.0.") + run_parser.add_argument("-e", "--experiment_name", type=str, + help="Optional argument to only run a specific experiment") + run_parser.add_argument("-g", "--game", type=str, + required=True, help="A specific game name (see ls).") + + score_parser = sub_parsers.add_parser("score") + score_parser.add_argument("-e", "--experiment_name", type=str, + help="Optional argument to only run a specific experiment") + score_parser.add_argument("-g", "--game", type=str, + help="A specific game name (see ls).", default="all") + + transcribe_parser = sub_parsers.add_parser("transcribe") + transcribe_parser.add_argument("-e", "--experiment_name", type=str, + help="Optional argument to only run a specific experiment") + transcribe_parser.add_argument("-g", "--game", type=str, + help="A specific game name (see ls).", default="all") + args = parser.parse_args() main(args)
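Since `run` now takes its models via `-m/--models` with `nargs="*"`, the option accepts one or two model names and always hands `benchmark.run` a list (or `None` when omitted). A minimal, self-contained sketch of just that parsing behaviour, trimmed down from the parser defined above:

```python
import argparse

# Trimmed-down reconstruction of the "run" sub-command, for illustration only.
parser = argparse.ArgumentParser()
sub_parsers = parser.add_subparsers(dest="command_name")
run_parser = sub_parsers.add_parser("run")
run_parser.add_argument("-m", "--models", type=str, nargs="*")
run_parser.add_argument("-g", "--game", type=str, required=True)

args = parser.parse_args(["run", "-g", "taboo", "-m", "mock", "mock"])
assert args.command_name == "run"
assert args.game == "taboo"
assert args.models == ["mock", "mock"]  # a single "-m mock" would give ["mock"] instead
```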