"""Get list of papers from arxiv and parse to local db"""
from llmtuner.infogetter.infobase import InfoBase
import xml.etree.ElementTree as ET
class ArxivTalker(InfoBase):
    """Retrieve entry information from arXiv and optionally store it.

    The talker queries the arXiv API through the request machinery inherited
    from ``InfoBase``, parses the returned Atom feed into plain dictionaries
    and, when a configuration object is set, writes the parsed entries to the
    configured store.
    """

    def __init__(self, config=""):
        """Create a talker registered under the source name ``"arxiv"``.

        Args:
            config: Configuration object handed on to ``InfoBase``. The
                default empty string is falsy, in which case
                :meth:`list_entries` returns the entries instead of writing
                them to a store.
        """
        super().__init__("arxiv", config)

    @staticmethod
    def _get_entry(content):
        """Parse an arXiv Atom feed into a list of entry dictionaries.

        Args:
            content: The XML document (``str`` or ``bytes``) returned by the
                arXiv API.

        Returns:
            A list with one dict per ``<entry>`` element. Every dict has the
            keys ``title``, ``authors``, ``published``, ``doi``, ``category``,
            ``path`` and ``pdf`` (value ``None`` when the information is not
            present in the feed). The keys ``updated``, ``id``, ``abstract``
            and ``comment`` are only added when the corresponding element
            exists in the entry.

        Raises:
            xml.etree.ElementTree.ParseError: If ``content`` is not
                well-formed XML.
        """
        # Namespace prefixes used in the XPath expressions below.
        namespaces = {
            "atom": "http://www.w3.org/2005/Atom",
            "arxiv": "http://arxiv.org/schemas/atom",
        }

        def text_of(element, tag):
            """Return the text of the first ``tag`` child, or None if absent."""
            node = element.find(tag, namespaces)
            return node.text if node is not None else None

        def href_where(links, attribute, value):
            """Return the href of the first link whose ``attribute`` equals ``value``."""
            return next(
                (link.attrib["href"] for link in links if link.attrib.get(attribute) == value),
                None,
            )

        # Elements that are copied only when present: (result key, XML tag).
        optional_fields = (
            ("updated", "atom:updated"),
            ("id", "atom:id"),
            ("abstract", "atom:summary"),
            ("comment", "arxiv:comment"),
        )

        root = ET.fromstring(content)
        entries = []
        for entry in root.findall("atom:entry", namespaces):
            # All link lookups share one scan of the entry's <link> children.
            links = entry.findall("atom:link", namespaces)
            first_category = entry.find("atom:category", namespaces)
            entry_data = {
                "title": text_of(entry, "atom:title"),
                "authors": [
                    text_of(author, "atom:name")
                    for author in entry.findall("atom:author", namespaces)
                ],
                "published": text_of(entry, "atom:published"),
                "doi": href_where(links, "title", "doi"),
                # Only the first listed category is kept.
                "category": (
                    first_category.attrib["term"] if first_category is not None else None
                ),
                "path": href_where(links, "type", "text/html"),
                "pdf": href_where(links, "title", "pdf"),
            }
            for key, tag in optional_fields:
                node = entry.find(tag, namespaces)
                # Compare against None explicitly: an Element without child
                # elements is falsy, so a plain truth test would silently skip
                # these text-only elements.
                if node is not None:
                    entry_data[key] = node.text
            entries.append(entry_data)
        return entries

    def list_entries(self, query, nentries=100, write_to_index=False):
        """Get a list of arXiv entries matching a query term, e.g. 'ANTARES'.

        If a configuration object was provided to the talker, the results are
        written to the database instead of being returned.

        Args:
            query: Search term; it is sent to the API as ``all:<query>``.
            nentries: Maximum number of entries to request.
            write_to_index: Currently unused and kept for backward
                compatibility; whether entries are written depends only on
                the truthiness of ``self.config``.

        Returns:
            The list of parsed entries (see :meth:`_get_entry`) when no
            configuration is set, otherwise ``None``.
        """
        params = {
            "search_query": "all:" + query,
            "start": 0,
            "max_results": nentries,
        }
        # NOTE(review): _get is inherited from InfoBase and is expected to
        # leave the HTTP response in self.response - confirm against InfoBase.
        self._get("", params=params)
        # Parse the XML response
        entries = self._get_entry(self.response.content)
        if not self.config:
            return entries

        store = self.config.get_store()
        # Keep the store open across all writes and close it once at the end.
        store.keepopen = True
        try:
            for entry in entries:
                self.write_to_index(entry)
        finally:
            # Close the store even if writing one of the entries fails.
            store._close()
        return None

    def write_to_index(self, entry):
        """Write one parsed arXiv entry to the database tables.

        The entry metadata goes to the ``arxiv`` table and the link
        information to the ``stagingdocuments`` table, both keyed on
        ``docid``.

        Args:
            entry: An entry dict as produced by :meth:`_get_entry`. It must
                contain the keys ``path`` and ``pdf``.
        """
        # The document id is the last path segment of the abstract URL.
        docid = "arxiv:" + entry["path"].rsplit("/", 1)[-1]

        arxiv_keys = (
            "title", "authors", "published", "updated",
            "abstract", "comment", "doi", "category",
        )
        arxiv_data = {"docid": docid}
        # Copy only the metadata fields that are present in this entry.
        arxiv_data.update((key, entry[key]) for key in arxiv_keys if key in entry)

        link_data = {
            "docid": docid,
            "sourcename": "arxiv",
            "contenttype": "article",
            "doctype": "pdf",
            "url": entry["path"],
            "download": entry["pdf"],
        }

        self.config.store.add_data_to_table("arxiv", arxiv_data, ["docid"])
        self.config.store.add_data_to_table("stagingdocuments", link_data, ["docid"])

    def download(self, docid):
        """Download a single entry to embed in the vector database.

        Not implemented yet: the method currently does nothing and returns
        ``None``.
        """
        pass