Source code for llmtuner.infogetter.arxiv

"""Get list of papers from arxiv and parse to local db"""

from llmtuner.infogetter.infobase import InfoBase

import xml.etree.ElementTree as ET
                

[docs]
class ArxivTalker(InfoBase):
    """Retrieve entry metadata from arXiv and optionally write it to the index.

    The talker registers itself with the base class under the source name
    ``"arxiv"``. Request handling (``self._get``, ``self.response``) and the
    ``self.config`` attribute are provided by ``InfoBase``, which is defined
    outside this file.
    """

    # XML namespaces needed to address elements of the Atom feed.
    _NAMESPACES = {
        "atom": "http://www.w3.org/2005/Atom",
        "arxiv": "http://arxiv.org/schemas/atom",
    }

    # Optional child elements of an entry, mapped to the key their text is
    # stored under. A key is only added when the element exists in the feed.
    _OPTIONAL_FIELDS = (
        ("atom:updated", "updated"),
        ("atom:id", "id"),
        ("atom:summary", "abstract"),
        ("arxiv:comment", "comment"),
    )

    # Entry keys copied verbatim into the "arxiv" table.
    _ARXIV_TABLE_KEYS = (
        "title",
        "authors",
        "published",
        "updated",
        "abstract",
        "comment",
        "doi",
        "category",
    )

    def __init__(self, config=""):
        """Create the talker.

        Args:
            config: Configuration object handed to ``InfoBase``. The default
                empty string is falsy, in which case ``list_entries`` returns
                its results instead of writing them to the store.
        """
        super().__init__("arxiv", config)

    @staticmethod
    def _first_href(links, attribute, value):
        """Return the ``href`` of the first link whose *attribute* equals *value*.

        Returns ``None`` when no link matches.
        """
        return next(
            (link.attrib["href"] for link in links if link.attrib.get(attribute) == value),
            None,
        )

    @staticmethod
    def _get_entry(content):
        """Parse an Atom feed into a list of entry dictionaries.

        Args:
            content: The XML document (``str`` or ``bytes``) to parse.

        Returns:
            A list with one dict per ``atom:entry`` element. The keys
            ``title``, ``authors``, ``published``, ``doi``, ``category``,
            ``path`` and ``pdf`` are always present; ``doi``, ``category``,
            ``path`` and ``pdf`` are ``None`` when the feed has no matching
            element. ``updated``, ``id``, ``abstract`` and ``comment`` are
            only present when the corresponding element exists.

        Raises:
            xml.etree.ElementTree.ParseError: If *content* is not well-formed XML.
            AttributeError: If an entry lacks ``atom:title`` or
                ``atom:published``, or an author lacks ``atom:name``.
        """
        namespaces = ArxivTalker._NAMESPACES
        root = ET.fromstring(content)

        entries = []
        for entry in root.findall("atom:entry", namespaces):
            links = entry.findall("atom:link", namespaces)
            entry_data = {
                "title": entry.find("atom:title", namespaces).text,
                "authors": [
                    author.find("atom:name", namespaces).text
                    for author in entry.findall("atom:author", namespaces)
                ],
                "published": entry.find("atom:published", namespaces).text,
                "doi": ArxivTalker._first_href(links, "title", "doi"),
                # Only the first category listed on the entry is kept.
                "category": next(
                    (
                        category.attrib["term"]
                        for category in entry.findall("atom:category", namespaces)
                    ),
                    None,
                ),
                "path": ArxivTalker._first_href(links, "type", "text/html"),
                "pdf": ArxivTalker._first_href(links, "title", "pdf"),
            }

            for tag, key in ArxivTalker._OPTIONAL_FIELDS:
                element = entry.find(tag, namespaces)
                # Compare against None explicitly: an Element without child
                # elements is falsy, so a plain truth test would skip every
                # text-only element such as the summary.
                if element is not None:
                    entry_data[key] = element.text

            entries.append(entry_data)

        return entries

    def list_entries(self, query, nentries=100, write_to_index=False):
        """Query arXiv for entries matching *query*.

        Args:
            query: Search term, e.g. ``'ANTARES'``. It is sent as
                ``all:<query>``.
            nentries: Maximum number of results to request.
            write_to_index: Currently unused; kept for backward
                compatibility. Whether results are written is decided solely
                by whether the talker has a truthy ``config``.

        Returns:
            The list of parsed entries when the talker has no config.
            ``None`` when a config is present; in that case every entry is
            written to the store via ``write_to_index`` instead.
        """
        params = {
            "search_query": "all:" + query,
            "start": 0,
            "max_results": nentries,
        }

        self._get("", params=params)

        # Parse the XML response
        entries = self._get_entry(self.response.content)

        if not self.config:
            return entries

        store = self.config.get_store()
        # NOTE(review): presumably keeps the store connection open across the
        # individual writes below; confirm against the store implementation.
        store.keepopen = True
        try:
            for entry in entries:
                self.write_to_index(entry)
        finally:
            # Close the store even when one of the writes fails.
            store._close()
        return None

    def write_to_index(self, entry):
        """Write one parsed arXiv entry to the store tables.

        The metadata goes to the ``"arxiv"`` table and the link information
        to the ``"stagingdocuments"`` table, both keyed on ``docid``.

        Args:
            entry: Entry dict as produced by ``_get_entry``. It must contain
                a string under ``"path"`` and a ``"pdf"`` key.

        Raises:
            KeyError: If *entry* has no ``"path"`` or ``"pdf"`` key.
            AttributeError: If ``entry["path"]`` is ``None``.
        """
        # The document id is the last path component of the entry URL.
        docid = "arxiv:" + entry["path"].rsplit("/", 1)[-1]

        arxiv_data = {"docid": docid}
        for key in self._ARXIV_TABLE_KEYS:
            if key in entry:
                arxiv_data[key] = entry[key]

        link_data = {
            "docid": docid,
            "sourcename": "arxiv",
            "contenttype": "article",
            "doctype": "pdf",
            "url": entry["path"],
            "download": entry["pdf"],
        }

        self.config.store.add_data_to_table("arxiv", arxiv_data, ["docid"])
        self.config.store.add_data_to_table("stagingdocuments", link_data, ["docid"])

    def download(self, docid):
        """Download a single entry for embedding in the vector database.

        Not implemented yet; the method currently does nothing and returns
        ``None``.
        """
        pass