Source code for llmtuner.infogetter.infobase

"""Get documents from external sources, perhaps preprocess content"""

from datetime import datetime
import mimetypes

from llmtuner.interface import Interface

class InfoBase(Interface):
    """Basic class to retrieve entry information from an external source.

    If you pass a LLMTunerConfig, you can store results in the DB.
    Can be initialized passing a baseurl and an identifier for the source.
    """

    def __init__(self, interfaceid="", config="", url=""):
        """Initialize the interface and optionally register a base URL.

        Args:
            interfaceid: Identifier of the source, forwarded to ``Interface``.
            config: Configuration object, forwarded to ``Interface``.
            url: Optional base URL. When non-empty it is stored as
                ``self.baseurl`` and written to the config under ``base_url``.
        """
        super().__init__(interfaceid, config)
        if url:
            self.baseurl = url
            self.update_config({"base_url": url})
        # Last response read by download(); None until a request was made.
        self.response = None

    @staticmethod
    def get_timestamp(message=""):
        """Get an entry to add to the history of data processing.

        Args:
            message: Text appended after the timestamp.

        Returns:
            A string of the form ``"YYYY-MM-DD HH:MM:SS || <message>"`` using
            the current local time.
        """
        stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        return f"{stamp} || {message}"

    @staticmethod
    def get_mimetype(filename=""):
        """Get the mimetype from a filename.

        Args:
            filename: File name (or path) whose extension is inspected.

        Returns:
            The guessed MIME type, or ``"text"`` when none can be guessed.
        """
        mime_type, _ = mimetypes.guess_type(filename)
        return mime_type or "text"

    @staticmethod
    def _filename_from_url(url, fallback=""):
        """Return the last path segment of *url*, or *fallback* if it is empty.

        Only the path component is used, so a query string or fragment never
        ends up in the file name (it would also hide the extension from the
        mimetype lookup).
        """
        from urllib.parse import urlsplit

        return urlsplit(url).path.split("/")[-1] or fallback

    def download(self, docids):
        """Download files or webpages to local storage from their external URL.

        The rows matching *docids* are read from the ``documentindex`` table.
        For each row a staging entry is built, the URL is requested through
        ``self._get`` and, on success, the entry is filled according to the
        row's ``download`` type (``"webpage"`` or ``"file"``). The entry is
        then written to the ``stagingdocuments`` table, keyed on ``docid``.

        Args:
            docids: Document id(s) passed as the ``docid`` filter to the store.
        """
        rows = self.config.store.get_data_from_table(
            "documentindex", {"docid": docids}, True
        )
        for docinfo in rows:
            stagingentry = {
                "url": docinfo["url"],
                "status": "listed",
                "filepath": "",
                "docid": docinfo["docid"],
                "content": "",
                "history": [],
                "keywords": [],
                "grouping": [],
                "contentformat": "",
            }

            # _get is not defined in this class; it is presumably inherited
            # from Interface and expected to populate self.response.
            self._get(docinfo["url"])
            response = self.response

            # Guard against a request that never produced a response object.
            if response is not None and response.ok:
                download_type = docinfo["download"]
                if download_type == "webpage":
                    stagingentry["content"] = response.content
                    stagingentry["contentformat"] = "text/html"
                    self._mark_retrieved(stagingentry)
                elif download_type == "file":
                    filename = self._filename_from_url(
                        docinfo["url"], fallback=str(docinfo["docid"])
                    )
                    filepath = self.config.store.write_local(
                        response.content, filename
                    )
                    stagingentry["contentformat"] = self.get_mimetype(filename)
                    stagingentry["filepath"] = filepath
                    self._mark_retrieved(stagingentry)
                else:
                    print(
                        f"Do not know how to deal with download type "
                        f"'{download_type}'. Choose another one."
                    )

            # NOTE(review): every row is written to staging, including those
            # that were not retrieved (they keep status "listed") - confirm
            # this matches the intended nesting of the original code.
            self.config.store.add_data_to_table(
                "stagingdocuments", stagingentry, ["docid"], update=True
            )

    def _mark_retrieved(self, stagingentry):
        """Flag *stagingentry* as retrieved and log it in its history."""
        stagingentry["status"] = "retrieved"
        stagingentry["history"].append(self.get_timestamp("retrieved"))