Source code for llmtuner.llms.evaluation

import logging
import json
import csv

import evaluate

from llmtuner.config import LLMTunerConfig
from llmtuner.llms.anyllm import AnyLLMChatter

class Evaluator:
    """Class to perform LLM evaluation for a given workspace."""

    def __init__(self, testname, workspace="", datasetname="", description=""):
        self.spaceslug = workspace
        self._set_chatter()
        self._set_dataset(datasetname)
        self.testname = testname
        self.description = description
        self.results = {}

    def _set_chatter(self, spaceslug=""):
        """Set chatter for evaluation by workspace slug name."""
        if spaceslug:
            self.spaceslug = spaceslug
            logging.debug("Setting workspace slug to " + self.spaceslug)
        if self.spaceslug:
            self.chatter = AnyLLMChatter(self.spaceslug)
        else:
            logging.warning("You did not define a workspace slug. Evaluator will not work.")
            self.chatter = ""

    def _set_dataset(self, datasetname=""):
        """Set dataset for evaluation."""
        if datasetname:
            self.dataset = EvalData(datasetname)
        else:
            self.dataset = False
            logging.info("Dataset could not be set. Please provide a dataset name.")

    def set_description(self, description):
        """Set the description for the test."""
        self.description = description

    def _write_test(self, testname="", description="", overwrite=False):
        """Write the test configuration, responses and results to the database."""
        # A test ID (testname), a workspace chatter and a dataset are all required.
        if testname:
            self.testname = testname
        if not self.testname:
            logging.warning("You did not provide a testname. Cannot write test results to database.")
        if not self.chatter:
            logging.warning("No workspace set. Cannot write test results to database.")
        if not self.dataset:
            logging.warning("No evaluation dataset defined. Cannot write test data to database.")
        if self.testname and self.chatter and self.dataset:
            if "responses" in self.results:
                for entry in self.results["responses"]:
                    entry.setdefault("test_id", self.testname)
                    self.dataset.config.store.add_data_to_table("testresponses", entry, ["test_id", "question_id"])
                logging.debug("Added " + str(len(self.results["responses"])) + " responses to the database.")
            else:
                logging.warning("Did not find chat responses for test, cannot write test results to database.")
            self.chatter.workspace._get_metadata_from_server(True)
            testconfig = {"test_id": self.testname,
                          "description": description,
                          "testdata_label": self.dataset.setname,
                          "workspace_metadata": json.dumps(self.chatter.workspace.meta),
                          "method_name": None,
                          "method_config": None,
                          }
            wrote_one_config = False
            for methodname in self.results:
                if methodname != "responses":
                    logging.debug("Writing results for method " + methodname)
                    testconfig["method_name"] = methodname
                    if "configs" in self.results[methodname]:
                        testconfig["method_config"] = json.dumps(self.results[methodname]["configs"])
                        logging.debug("Including config " + str(testconfig["method_config"]))
                    self.dataset.config.store.add_data_to_table("testconfigurations", testconfig)
                    wrote_one_config = True
                    for resultname in self.results[methodname]:
                        logging.debug("Writing result " + resultname)
                        if resultname != "configs":
                            if len(self.results["responses"]) == len(self.results[methodname][resultname]):
                                # One result value per question: store each with its question ID.
                                for i in range(len(self.results["responses"])):
                                    result = {"test_id": self.testname,
                                              "question_id": self.results["responses"][i]["question_id"],
                                              "method": methodname,
                                              "parameter": resultname,
                                              "result": self.results[methodname][resultname][i],
                                              }
                                    self.dataset.config.store.add_data_to_table("testresults", result)
                            else:
                                # A single value for the whole test: store it under question_id "all".
                                result = {"test_id": self.testname,
                                          "question_id": "all",
                                          "method": methodname,
                                          "parameter": resultname,
                                          "result": "",
                                          }
                                if isinstance(self.results[methodname][resultname], list) and len(self.results[methodname][resultname]) == 1:
                                    result["result"] = self.results[methodname][resultname][0]
                                elif isinstance(self.results[methodname][resultname], str):
                                    result["result"] = self.results[methodname][resultname]
                                else:
                                    logging.warning("Don't know what to do with result " + methodname + " of type " + str(type(self.results[methodname][resultname])))
                                if result["result"]:
                                    self.dataset.config.store.add_data_to_table("testresults", result)
            if not wrote_one_config:
                self.dataset.config.store.add_data_to_table("testconfigurations", testconfig)
                logging.info("Did not find evaluation for test " + self.testname + ". Writing test configuration without methods and results.")

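    # For orientation, a sketch of the expected shape of self.results when writing,
    # inferred from retrieve_responses() and evaluate() below. Keys other than
    # "responses" are method names such as "bertscore"; per-question metrics are
    # lists aligned with the responses, while e.g. bertscore's "hashcode" is a
    # single string:
    #
    #   {
    #       "responses": [{"question_id": "q1", "response": "..."}, ...],
    #       "bertscore": {"configs": {...},
    #                     "precision": [...], "recall": [...], "f1": [...],
    #                     "hashcode": "..."},
    #   }
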
    def retrieve_responses(self, redo=False, write_to_db=True):
        """Create responses for the questions in the dataset."""
        if not self.dataset:
            logging.error("Cannot get responses without questions. Please add questions to the dataset first.")
        else:
            storedresponses = self.dataset.config.store.get_data_from_table(
                "testresponses",
                {"test_id": self.testname,
                 "question_id": [question["question_id"] for question in self.dataset.data]},
                asdicts=True)
            if ("responses" not in self.results or redo) and len(storedresponses) != len(self.dataset.data):
                responses = []
                for question in self.dataset.data:
                    responses.append({"question_id": question["question_id"],
                                      "response": self.chatter.chat(question["question"])})
                # Assign directly (not setdefault) so that redo=True actually
                # replaces previously stored responses.
                self.results["responses"] = responses
            elif "responses" in self.results:
                logging.info("Responses for test data already retrieved, will not redo the queries.")
            elif len(storedresponses) == len(self.dataset.data):
                self.results.setdefault("responses", storedresponses)
                logging.info("Getting responses for this test from the database.")
            else:
                logging.info("Reached an unexpected state, did not set responses.")
        if write_to_db:
            self._write_test()

    def evaluate(self, methods=None, configs=None, write_to_db=True):
        """Perform evaluation according to the given methods. Extra configs can be
        passed per method as a dictionary."""
        # Avoid mutable default arguments.
        methods = methods or []
        configs = configs or {}
        if not methods:
            logging.warning("You cannot evaluate the LLM without providing an evaluation method. Please pass a method!")
        if "responses" not in self.results:
            logging.info("You did not yet create responses from the workspace. Will create responses now.")
            self.retrieve_responses()
        for method in methods:
            if method in ["bertscore"]:
                evaluator = evaluate.load(method)
                model_type = "bert-base-uncased"
                language = "en"
                rescale = True
                if "bertscore" in configs:
                    model_type = configs["bertscore"].get("model_type", model_type)
                    language = configs["bertscore"].get("language", language)
                    rescale = configs["bertscore"].get("rescale", rescale)
                bertconfig = {"model_type": model_type, "language": language, "rescale": rescale}
                outcome = evaluator.compute(
                    predictions=[result["response"] for result in self.results["responses"]],
                    references=[question["reference"] for question in self.dataset.data],
                    model_type=model_type,
                    lang=language,
                    rescale_with_baseline=rescale,
                )
                outcome.setdefault("configs", bertconfig)
                self.results.setdefault("bertscore", outcome)
        if write_to_db:
            self._write_test()

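# A minimal usage sketch (the test name "test-001", workspace slug "my-workspace"
# and dataset label "qa-set" are hypothetical and assume the dataset was stored
# beforehand via EvalData):
#
#   evaluator = Evaluator("test-001", workspace="my-workspace",
#                         datasetname="qa-set", description="baseline run")
#   evaluator.retrieve_responses()
#   evaluator.evaluate(methods=["bertscore"],
#                      configs={"bertscore": {"language": "en", "rescale": True}})
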
class EvalData:
    """Evaluation dataset backed by the local database."""

    def __init__(self, setname="", config=""):
        if config == "":
            self.config = LLMTunerConfig()
        elif isinstance(config, str):
            self.config = LLMTunerConfig(config)
        elif isinstance(config, LLMTunerConfig):
            self.config = config
        else:
            logging.warning("Did not know how to handle the configuration you passed. "
                            "Please provide a file path to a configuration or an LLMTunerConfig object.")
        self.datatablename = "testdata"
        self.data = []
        self.load_from_db(setname)

    def load_from_db(self, setname=""):
        """Load the dataset from the database."""
        if setname:
            self.setname = setname
            data = self.config.store.get_data_from_table(self.datatablename, {"labels": "select:" + setname}, True)
            if len(data) == 0:
                logging.info("Did not find any entry in the database matching setname " + setname + ". Have fun creating a new one!")
            else:
                self.data = data
        else:
            logging.warning("You are initiating an evaluation dataset without a setname. Please provide a dataset name!")

    def load_from_file(self, filepath, setname="", write_to_db=False):
        """Load a dataset from a local CSV file. Rows are written directly to the
        database if 'write_to_db' is True. Required columns are 'question_id',
        'question' and 'reference'."""
        with open(filepath, mode="r", newline="", encoding="utf-8") as file:
            reader = csv.DictReader(file)
            for row in reader:
                # Only accept rows that carry all required columns.
                gotall = all(key in row for key in ("question_id", "question", "reference"))
                if gotall:
                    unique = True
                    for data in self.data:
                        if row["question_id"] == data["question_id"]:
                            unique = False
                            logging.info("Question with ID " + row["question_id"] + " already in dataset, will not add.")
                    if unique:
                        self.data.append(row)
        if setname:
            self.setname = setname
        if write_to_db:
            self._write_to_db(origin="file:" + filepath)

    def _write_to_db(self, setname="", origin=""):
        """Write the current data to the local database."""
        # Honor the setname parameter, mirroring load_from_db().
        if setname:
            self.setname = setname
        for info in self.data:
            info.setdefault("origin", origin)
            info.setdefault("labels", [self.setname])
            self.config.store.add_data_to_table(self.datatablename, info, "question_id", True)

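# A minimal usage sketch for EvalData (the file name "questions.csv" and the set
# name "qa-set" are hypothetical). Given a CSV file with the required columns, e.g.
#
#   question_id,question,reference
#   q1,What is the capital of France?,Paris
#
# the dataset can be loaded and stored with:
#
#   dataset = EvalData()
#   dataset.load_from_file("questions.csv", setname="qa-set", write_to_db=True)
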
class EvalResult:
    """Class to hold the outcome of a chatter evaluation."""

    def __init__(self, testname):
        self.testname = testname