import csv
import json
import logging

import evaluate

from llmtuner.config import LLMTunerConfig
from llmtuner.llms.anyllm import AnyLLMChatter


class Evaluator:
    """Class to perform LLM evaluation for a given workspace."""

    def __init__(self, testname, workspace="", datasetname="", description=""):
        self.spaceslug = workspace
        # Holds chat responses and per-method evaluation outcomes.
        self.results = {}
        self._set_chatter()
        self._set_dataset(datasetname)
        self.testname = testname
        self.description = description
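
    # A minimal usage sketch (hedged: "demo-test", "my-workspace" and "qa-set" are
    # placeholder names, and the "qa-set" dataset is assumed to exist in the DB):
    #
    #   ev = Evaluator("demo-test", workspace="my-workspace", datasetname="qa-set")
    #   ev.retrieve_responses()             # query the workspace LLM per question
    #   ev.evaluate(methods=["bertscore"])  # score responses against references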

    def _set_chatter(self, spaceslug=""):
        """Set the chatter for evaluation by workspace slug name."""
        if spaceslug:
            self.spaceslug = spaceslug
            logging.debug("Setting workspace slug to " + self.spaceslug)
        if self.spaceslug:
            self.chatter = AnyLLMChatter(self.spaceslug)
        else:
            logging.warning("You did not define a workspace slug. Evaluator will not work.")
            self.chatter = None

    def _set_dataset(self, datasetname=""):
        """Set the dataset for evaluation."""
        if datasetname:
            self.dataset = EvalData(datasetname)
        else:
            self.dataset = None
            logging.info("Dataset could not be set. Please provide a dataset name.")

    def set_description(self, description):
        """Set the description for the test."""
        self.description = description

    def _write_test(self, testname="", description="", overwrite=False):
        """Create a new test instance."""
        # A test ID is needed before anything can be written.
        if testname:
            self.testname = testname
        if not self.testname:
            logging.warning("You did not provide a test name. Cannot write test results to database.")
        if not self.chatter:
            logging.warning("No workspace set. Cannot write test results to database.")
        if not self.dataset:
            logging.warning("No evaluation dataset defined. Cannot write test data to database.")
        if self.testname and self.chatter and self.dataset:
            if "responses" in self.results:
                for entry in self.results["responses"]:
                    entry.setdefault("test_id", self.testname)
                    self.dataset.config.store.add_data_to_table("testresponses", entry, ["test_id", "question_id"])
                logging.debug("Added " + str(len(self.results["responses"])) + " responses to the database.")
            else:
                logging.warning("Did not find chat responses for test, cannot write test results to database.")
            self.chatter.workspace._get_metadata_from_server(True)
            testconfig = {"test_id": self.testname,
                          "description": description,
                          "testdata_label": self.dataset.setname,
                          "workspace_metadata": json.dumps(self.chatter.workspace.meta),
                          "method_name": None,
                          "method_config": None,
                          }
            wrote_one_config = False
            for methodname in self.results:
                if methodname != "responses":
                    logging.debug("Writing results for method " + methodname)
                    testconfig["method_name"] = methodname
                    if "configs" in self.results[methodname]:
                        testconfig["method_config"] = json.dumps(self.results[methodname]["configs"])
                        logging.debug("Including config " + str(testconfig["method_config"]))
                    self.dataset.config.store.add_data_to_table("testconfigurations", testconfig)
                    wrote_one_config = True
                    for resultname in self.results[methodname]:
                        logging.debug("Writing result " + resultname)
                        if resultname != "configs":
                            if len(self.results["responses"]) == len(self.results[methodname][resultname]):
                                # One score per question: store each with its question_id.
                                for i in range(len(self.results["responses"])):
                                    result = {"test_id": self.testname,
                                              "question_id": self.results["responses"][i]["question_id"],
                                              "method": methodname,
                                              "parameter": resultname,
                                              "result": self.results[methodname][resultname][i],
                                              }
                                    self.dataset.config.store.add_data_to_table("testresults", result)
                            else:
                                # Aggregate score: store a single row covering all questions.
                                result = {"test_id": self.testname,
                                          "question_id": "all",
                                          "method": methodname,
                                          "parameter": resultname,
                                          "result": "",
                                          }
                                if isinstance(self.results[methodname][resultname], list) and len(self.results[methodname][resultname]) == 1:
                                    result["result"] = self.results[methodname][resultname][0]
                                elif isinstance(self.results[methodname][resultname], str):
                                    result["result"] = self.results[methodname][resultname]
                                else:
                                    logging.warning("Don't know what to do with result " + methodname + " of type " + str(type(self.results[methodname][resultname])))
                                if result["result"]:
                                    self.dataset.config.store.add_data_to_table("testresults", result)
            if not wrote_one_config:
                self.dataset.config.store.add_data_to_table("testconfigurations", testconfig)
                logging.info("Did not find evaluation for test " + self.testname + ". Writing test configuration without methods and results.")

    def retrieve_responses(self, redo=False, write_to_db=True):
        """Create responses for the questions in the dataset."""
        if not self.dataset:
            logging.error("Cannot get responses without questions. Please add questions to the dataset first.")
        else:
            storedresponses = self.dataset.config.store.get_data_from_table(
                "testresponses",
                {"test_id": self.testname,
                 "question_id": [question["question_id"] for question in self.dataset.data]},
                asdicts=True)
            if ("responses" not in self.results or redo) and len(storedresponses) != len(self.dataset.data):
                responses = []
                for question in self.dataset.data:
                    responses.append({"question_id": question["question_id"],
                                      "response": self.chatter.chat(question["question"])})
                self.results["responses"] = responses
            elif "responses" in self.results:
                logging.info("Responses for test data already retrieved, will not redo the queries.")
            elif len(storedresponses) == len(self.dataset.data):
                self.results.setdefault("responses", storedresponses)
                logging.info("Getting responses for this test from the database.")
            else:
                logging.info("Reached an unexpected state; did not set responses.")
        if write_to_db:
            self._write_test()
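
    # Note on the branching above: fresh chat queries are issued only when redo=True
    # or when neither self.results nor the stored testresponses rows already cover
    # the full dataset; otherwise the cached or stored responses are reused.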

    def evaluate(self, methods=None, configs=None, write_to_db=True):
        """Perform evaluation according to the given methods. Extra configs can be passed per method as a dictionary."""
        methods = methods or []
        configs = configs or {}
        if not methods:
            logging.warning("You cannot evaluate the LLM without providing an evaluation method. Please pass a method!")
        if "responses" not in self.results:
            logging.info("You did not yet create responses from the workspace. Will create responses now.")
            self.retrieve_responses()
        for method in methods:
            if method in ["bertscore"]:
                evaluator = evaluate.load(method)
                # Defaults, overridable through the configs dictionary.
                model_type = 'bert-base-uncased'
                language = "en"
                rescale = True
                if "bertscore" in configs:
                    if "model_type" in configs["bertscore"]:
                        model_type = configs["bertscore"]["model_type"]
                    if "language" in configs["bertscore"]:
                        language = configs["bertscore"]["language"]
                    if "rescale" in configs["bertscore"]:
                        rescale = configs["bertscore"]["rescale"]
                bertconfig = {"model_type": model_type,
                              "language": language,
                              "rescale": rescale}
                outcome = evaluator.compute(predictions=[result["response"] for result in self.results["responses"]],
                                            references=[question["reference"] for question in self.dataset.data],
                                            model_type=model_type,
                                            lang=language,
                                            rescale_with_baseline=rescale,
                                            )
                outcome.setdefault("configs", bertconfig)
                self.results.setdefault("bertscore", outcome)
        if write_to_db:
            self._write_test()
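
    # A hedged sketch of the configs shape accepted by evaluate(); the keys mirror
    # the lookups in the method body, and the values shown are the code's defaults:
    #
    #   ev.evaluate(methods=["bertscore"],
    #               configs={"bertscore": {"model_type": "bert-base-uncased",
    #                                      "language": "en",
    #                                      "rescale": True}})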


class EvalData:
    def __init__(self, setname="", config=""):
        if config == "":
            self.config = LLMTunerConfig()
        elif isinstance(config, str):
            self.config = LLMTunerConfig(config)
        elif isinstance(config, LLMTunerConfig):
            self.config = config
        else:
            logging.warning("Did not know how to handle the configuration that you passed to this class. " +
                            "Please provide a file path to a configuration or an LLMTunerConfig object.")
        self.data = []
        self.datatablename = "testdata"
        self.load_from_db(setname)
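
    # Hedged construction examples ("config.yaml" and "qa-set" are placeholder
    # names; the accepted config types are exactly those checked in __init__):
    #
    #   ds = EvalData("qa-set")                        # default LLMTunerConfig()
    #   ds = EvalData("qa-set", config="config.yaml")  # config loaded from a file path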

    def load_from_db(self, setname=""):
        """Load the dataset from the DB."""
        if setname:
            self.setname = setname
            data = self.config.store.get_data_from_table(self.datatablename, {"labels": "select:" + setname}, True)
            if len(data) == 0:
                logging.info("Did not find any entry in the DB to match setname " + setname + ". Have fun creating a new one!")
            else:
                self.data = data
        else:
            logging.warning("You are initiating an evaluation dataset without a setname. Please provide a dataset name!")

    def load_from_file(self, filepath, setname="", write_to_db=False):
        """Load the dataset from a local file. It is added directly to the database if 'write_to_db' is True.
        Required columns are 'question_id', 'question' and 'reference'."""
        with open(filepath, mode='r', newline='', encoding='utf-8') as file:
            reader = csv.DictReader(file)
            for row in reader:
                if all(key in row for key in ['question_id', 'question', 'reference']):
                    # Only add rows whose question_id is not already in the dataset.
                    unique = True
                    for data in self.data:
                        if row["question_id"] == data["question_id"]:
                            unique = False
                            logging.info("Question with ID " + row["question_id"] + " already in dataset, will not add.")
                    if unique:
                        self.data.append(row)
                else:
                    logging.warning("Skipping a row that is missing one of the required columns.")
        if setname:
            self.setname = setname
        if write_to_db:
            self._write_to_db(origin="file:" + filepath)

    def _write_to_db(self, setname="", origin=""):
        """Write the current data to the local DB."""
        if setname:
            self.setname = setname
        for info in self.data:
            info.setdefault("origin", origin)
            info.setdefault("labels", [self.setname])
            self.config.store.add_data_to_table(self.datatablename, info, "question_id", True)


class EvalResult:
    """Class to hold the outcome of a chatter evaluation."""

    def __init__(self, testname):
        self.testname = testname