"""Get documents from external sources, perhaps preprocess content"""
from datetime import datetime
import mimetypes
from llmtuner.interface import Interface
[docs]
class InfoBase(Interface):
"""Basic class to retrieve entry information from an external source. If you pass an LLMTunerConfig, you can store results in the DB.
Can be initialized by passing a base URL and an identifier for the source."""
def __init__(self, interfaceid = "", config = "", url = ""):
    """Initialise the parent interface and optionally register a base URL.

    Args:
        interfaceid: Identifier handed on to the parent initializer.
        config: Configuration handed on to the parent initializer.
        url: When non-empty, stored as ``self.baseurl`` and passed to
            ``update_config`` under the key ``"base_url"``.
    """
    super().__init__(interfaceid, config)
    # Nothing else to do when no URL was supplied.
    if not url:
        return
    self.baseurl = url
    url_setting = {"base_url": url}
    self.update_config(url_setting)
@staticmethod
[docs]
def get_timestamp(message = ""):
    """Build an entry to add to the history of data processing.

    The entry has the form ``YYYY-MM-DD HH:MM:SS || message``, using the
    time returned by ``datetime.now()``.

    Args:
        message: Description of the processing step. Non-string values
            are converted to their string form.

    Returns:
        The formatted history entry as a string.
    """
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # Format the message rather than concatenating it, so that non-string
    # values (numbers, None, exceptions) do not raise TypeError.
    return f"{stamp} || {message}"
@staticmethod
[docs]
def get_mimetype(filename = ""):
    """Return the MIME type guessed from a file name.

    Args:
        filename: File name (or path) whose extension is inspected.

    Returns:
        The type reported by ``mimetypes.guess_type``, or the string
        ``"text"`` when no type could be guessed.
    """
    guessed = mimetypes.guess_type(filename)[0]
    if not guessed:
        return "text"
    return guessed
[docs]
def download(self, docids):
    """Download files or webpages to local storage from external URL using its docid.

    Reads the "documentindex" rows matching ``docids``, requests each row's
    URL through ``self._get`` and writes one entry per row to the
    "stagingdocuments" table (matched on "docid", with ``update=True``).
    An entry whose download did not succeed, or whose download type is not
    handled, is stored with the status "listed" and an empty history.

    Args:
        docids: Value used as the "docid" filter when querying the
            "documentindex" table.
    """
    rows = self.config.store.get_data_from_table("documentindex", {"docid": docids}, True)
    for docinfo in rows:
        stagingentry = {
            "url": docinfo["url"], "status": "listed", "filepath": "", "docid": docinfo["docid"],
            "content": "", "history": [], "keywords": [], "grouping": [], "contentformat": "",
        }
        self._get(docinfo["url"])
        if self.response.ok:
            download_type = docinfo["download"]
            retrieved = True
            if download_type == "webpage":
                stagingentry["content"] = self.response.content
                stagingentry["contentformat"] = "text/html"
            elif download_type == "file":
                # NOTE(review): the name is the last "/"-separated segment of
                # the URL, so a query string stays part of the file name.
                filename = docinfo["url"].split("/")[-1]
                filepath = self.config.store.write_local(self.response.content, filename)
                stagingentry["contentformat"] = self.get_mimetype(filename)
                stagingentry["filepath"] = filepath
            else:
                retrieved = False
                # Formatted rather than concatenated so that a non-string
                # download type (e.g. None) is reported instead of raising.
                print(f"Do not know how to deal with download type '{download_type}'. Choose another one.")
            if retrieved:
                stagingentry["status"] = "retrieved"
                # Both download types record the same history format by
                # going through get_timestamp.
                stagingentry["history"].append(self.get_timestamp("retrieved"))
        # NOTE(review): stored for every row, including failed downloads;
        # confirm this matches the intended placement of the call.
        self.config.store.add_data_to_table("stagingdocuments", stagingentry, ["docid"], update = True)