Deelvin · vvchernov · Sep 29, 2023 · Sep 29, 2023 · Sep 29, 2023 · Sep 29, 2023
diff --git a/README.md b/README.md
@@ -74,47 +74,3 @@ See details about other script features using help arg:
 ```bash
 python3 compare_models.py --help
 ```
-
-# README from the original github repo
-Due to the reason that the original repo do not allows to work with any other LLM from the box excluding ones from the paper it was forked and strongly refactored. But with regards to the authors of the initial work the text from their README is stayed here.
-
-## LMentry
-
-This repository contains the LMentry benchmark from [LMentry: A Language Model Benchmark of Elementary Language Tasks](https://arxiv.org/pdf/2211.02069.pdf), as well as the code to evaluate it.
-
-For any questions, feel free to open a GitHub issue or to contact us at avia.efrat@gmail.com :blush:
-
-### Getting the Data
-Simply clone the repo: 
-```shell
-git clone https://github.com/aviaefrat/lmentry.git
-```
-The data is in the `data` directory.
-
-### Generating Predictions
-We provide functions for generating predictions with Hugging Face and OpenAI models (see below), but you can generate predictions in any method of your choosing.
-
-For Hugging Face and OpenAI models, you can use the 
-`generate_all_hf_predictions` and 
-`generate_all_openai_predictions` functions from `predict.py`. These are what we used in our experiments.
-
-### Evaluation
-
-The easiest and recommended way is to use `evalutate.py`:
-```shell
-python -m lmentry.evaluate
-```
-_Don't forget to activate the lmentry environment (created from `environment.yml`) beforehand._  
-Using the `--num-procs=N` optional argument will score the predictions much faster.  
-`evalutate.py` will also automatically create files analyzing the results in a separate `results` dir.
-
-To use `evalutate.py`, the predictions must follow the same structure of [lmentry_predictions.zip](https://drive.google.com/file/d/1Ex1fde_PEzhIU5ctGkOJsacaGNnQeqke/view?usp=sharing) (if you used our functions from `predict.py`, your predictions already follow this structure):
-1. The top-level directory should be named `predictions`.
-2. `predictions` needs to contain exactly 41 directories, named after the 41 files in `data` (the 25 task names + the 16 files for the argument content robustness).
-3. Each of the 41 task predictions directories should contain a prediction file for each model you want to evaluate. For example, to evaluate the predictions of a model named `my-model`, each of the 41 directories should contain a file named `my-model.json` with the model's predictions for this task.
-4. Each predictions file should contain values in the form `"<id>": {"prediction": <prediction>},` where the `id`s correspond to those in the task's file in `data`.
-
-### Reproducing the Results from the Paper
-1. Clone the repository.
-2. Unzip `lmentry_predictions.zip` into the top-level lmentry directory.
-3. run `evaluate.py` (preferably with a not-very-small value for `--num-procs`, as there are 656 files to score...)
diff --git a/README_OLD.md b/README_OLD.md
@@ -0,0 +1,42 @@
+The original repo does not support, out of the box, any LLMs or tasks other than the ones from the paper, so it was forked and heavily refactored. Out of respect for the authors of the original work, the text of their README is preserved below.
+
+# LMentry
+
+This repository contains the LMentry benchmark from [LMentry: A Language Model Benchmark of Elementary Language Tasks](https://arxiv.org/pdf/2211.02069.pdf), as well as the code to evaluate it.
+
+For any questions, feel free to open a GitHub issue or to contact us at avia.efrat@gmail.com :blush:
+
+## Getting the Data
+Simply clone the repo: 
+```shell
+git clone https://github.com/aviaefrat/lmentry.git
+```
+The data is in the `data` directory.
+
+## Generating Predictions
+We provide functions for generating predictions with Hugging Face and OpenAI models (see below), but you can generate predictions in any method of your choosing.
+
+For Hugging Face and OpenAI models, you can use the 
+`generate_all_hf_predictions` and 
+`generate_all_openai_predictions` functions from `predict.py`. These are what we used in our experiments.
+
+## Evaluation
+
+The easiest and recommended way is to use `evaluate.py`:
+```shell
+python -m lmentry.evaluate
+```
+_Don't forget to activate the lmentry environment (created from `environment.yml`) beforehand._  
+Using the `--num-procs=N` optional argument will score the predictions much faster.  
+`evaluate.py` will also automatically create files analyzing the results in a separate `results` dir.
+
+To use `evaluate.py`, the predictions must follow the same structure of [lmentry_predictions.zip](https://drive.google.com/file/d/1Ex1fde_PEzhIU5ctGkOJsacaGNnQeqke/view?usp=sharing) (if you used our functions from `predict.py`, your predictions already follow this structure):
+1. The top-level directory should be named `predictions`.
+2. `predictions` needs to contain exactly 41 directories, named after the 41 files in `data` (the 25 task names + the 16 files for the argument content robustness).
+3. Each of the 41 task predictions directories should contain a prediction file for each model you want to evaluate. For example, to evaluate the predictions of a model named `my-model`, each of the 41 directories should contain a file named `my-model.json` with the model's predictions for this task.
+4. Each predictions file should contain values in the form `"<id>": {"prediction": <prediction>},` where the `id`s correspond to those in the task's file in `data`.
+
+## Reproducing the Results from the Paper
+1. Clone the repository.
+2. Unzip `lmentry_predictions.zip` into the top-level lmentry directory.
+3. run `evaluate.py` (preferably with a not-very-small value for `--num-procs`, as there are 656 files to score...)
diff --git a/compare_models.py b/compare_models.py
@@ -4,7 +4,7 @@
 
 from lmentry.constants import PREDICTIONS_ROOT_DIR, TASKS_DATA_DIR, RESULTS_DIR, DEFAULT_MAX_LENGTH
 from tasks.task_utils import get_tasks_names, task_groups, all_tasks
-from lmentry.predict import generate_all_hf_predictions
+from lmentry.predict import PredictorFactory
 from lmentry.analysis.accuracy import flexible_scoring
 from lmentry.analysis.comparison import create_per_task_accuracy_comparison_csv
 from lmentry.model_manager import get_short_model_names
@@ -18,14 +18,16 @@ def parse_arguments():
   parser.add_argument("-r", "--ref_model_name", type=str, default="vicuna-7b-v1-3-q0f16",
                       help="Name of reference model. It is assumed that the model is original, "
                            "uses high-precision data type and has better accuracy")
-  parser.add_argument('-p', '--probe_model_names', nargs="+", type=str, default="vicuna-7b-v1-3-q4f16_0",
+  parser.add_argument('-p', '--probe_model_names', nargs="+", type=str, default=["vicuna-7b-v1-3-q4f16_0"],
                       help=f"Names of probe models. If the number of the probe models is bigger than one "
                            "it iteratively compares the reference model with each from the list.")
   parser.add_argument('-t', '--task_names', nargs="+", type=str, default=get_tasks_names("7b"),
                       help="If need to evaluate specified set of tasks set their names or name(s) of specified task set(s). "
                            f"Task set names should be from the list: {task_groups.keys()}. "
                            f"Task names should be from the list: {all_tasks}. "
                            "It tries to analyze 7b-model sensetive task set by default")
+  parser.add_argument("-pt", "--predictor_type", type=str, default="hf",
+                      help=f"Type of predictor, can be chosen from the list: {PredictorFactory.predictors_map.keys()}")
   parser.add_argument('-d', '--device', type=str, default="cuda",
                       help="Device name. It is needed and used by mlc model only during predictions")
   parser.add_argument('-b', '--batch_size', type=int, default=100,
@@ -42,10 +44,14 @@ def parse_arguments():
                       type=int,
                       help="The number of processes to use when scoring the predictions. "
                            "Can be up to the number of models you want to evaluate * 41.")
-  parser.add_argument("-f", "--forced_scoring", action="store_true", default=False,
-                      help="If scoring has been done for specified task it skips it. This flag allows to redo ready scoring")
+  parser.add_argument('-fp', '--force_predict', action="store_true", default=False,
+                      help="Whether to force regenerate predictions.")
+  parser.add_argument("-fs", "--force_scoring", action="store_true", default=False,
+                      help="If scoring has been done for specified task it will be skiped. This flag allows to redo ready scoring")
   parser.add_argument("-c", "--certainty", action="store_true", default=False,
                       help="Conservative accuracy evaluation. The answer is considered correct only if it is absolutely certain")
+  parser.add_argument('-uv', '--use_vllm', action='store_true', default=False,
+                      help="Whether to use vLLM inference.")
   return parser.parse_args()
 
 
@@ -61,41 +67,55 @@ def main():
   args = parse_arguments()
   task_names = get_tasks_names(args.task_names)
 
-  for probe_model_name in tqdm(args.probe_model_names, desc="Models comparison"):
-    model_names = get_short_model_names([args.ref_model_name, probe_model_name])
-    print(f"Models {model_names[0]} and {model_names[1]} are compared")
-
-    # Predict specified tasks for given models
-    # Reference model
-    logging.info(f"Prediction for {model_names[0]} model starts")
-    generate_all_hf_predictions(
-      task_names=task_names,
-      model_name=args.ref_model_name,
+  # Init predictor
+  predictor = PredictorFactory.get_predictor(
+      name=args.predictor_type,
       max_length=args.max_length,
       batch_size=args.batch_size,
-      device=args.device,
       samples_num=args.samples_num,
-    )
-    logging.info(f"Prediction for {model_names[0]} model finished")
-    # Probe_model
-    logging.info(f"Prediction for {model_names[1]} model starts")
-    generate_all_hf_predictions(
+  )
+
+  # Predict specified tasks for reference model
+  logging.info(f"Prediction for {args.ref_model_name} model starts")
+  predictor.generate(
+    task_names=task_names,
+    model_name=args.ref_model_name,
+    device=args.device,
+    use_vllm=args.use_vllm,
+    force_predict=args.force_predict,
+  )
+  logging.info(f"Prediction for {args.ref_model_name} model finished")
+
+  # Score reference model
+  flexible_scoring(
+    task_names=task_names,
+    model_names=[args.ref_model_name],
+    num_processes=args.num_procs,
+    forced_scoring=args.force_predict or args.force_scoring,
+  )
+
+  for probe_model_name in tqdm(args.probe_model_names, desc="Models comparison"):
+    print(f"Models {args.ref_model_name} and {probe_model_name} are compared")
+
+    # Predict specified tasks for probe model
+    logging.info(f"Prediction for {probe_model_name} model starts")
+    predictor.generate(
       task_names=task_names,
       model_name=probe_model_name,
-      max_length=args.max_length,
-      batch_size=args.batch_size,
       device=args.device,
-      samples_num=args.samples_num,
+      use_vllm=args.use_vllm,
+      force_predict=args.force_predict,
     )
-    logging.info(f"Prediction for {model_names[1]} model finished")
+    logging.info(f"Prediction for {probe_model_name} model finished")
 
     flexible_scoring(
       task_names=task_names,
-      model_names=model_names,
+      model_names=[probe_model_name],
       num_processes=args.num_procs,
-      forced_scoring=args.forced_scoring,
+      forced_scoring=args.force_predict or args.force_scoring,  # option was renamed; regenerated predictions must be rescored, same as for the reference model
     )
 
+    model_names = get_short_model_names([args.ref_model_name, probe_model_name])
     create_per_task_accuracy_comparison_csv(model_names=model_names, task_names=task_names, certainty=args.certainty)
 
 

diff --git a/lmentry/input_preprocessor.py b/lmentry/input_preprocessor.py
@@ -0,0 +1,14 @@
+class PromptPreprocessor:
+    """Pass-through prompt preprocessor: `preprocess` returns its input as is."""
+
+    def __init__(self) -> None:
+        # No state to set up.
+        pass
+
+    def preprocess(self, raw_input_prompts):
+        """Return `raw_input_prompts` unchanged.
+
+        NOTE(review): presumably an extension point for model-specific prompt
+        formatting; confirm against the code that calls it.
+        """
+        return raw_input_prompts
diff --git a/lmentry/output_postprocessor.py b/lmentry/output_postprocessor.py
@@ -0,0 +1,14 @@
+class PredictionPostprocessor:
+    """Pass-through output postprocessor: `postprocess` returns its input as is."""
+
+    def __init__(self) -> None:
+        # No state to set up.
+        pass
+
+    def postprocess(self, raw_output_prompts):
+        """Return `raw_output_prompts` unchanged.
+
+        NOTE(review): presumably an extension point for cleaning up model
+        outputs before scoring; confirm against the code that calls it.
+        """
+        return raw_output_prompts