Source code for llmtuner.infogetter.arxiv

"""Get list of papers from arxiv and parse to local db"""

from llmtuner.infogetter.infobase import InfoBase

import xml.etree.ElementTree as ET
                
class ArxivTalker(InfoBase):
    """Class to retrieve entry information from ArXiv."""

    def __init__(self, config=""):
        """Set up the talker for the "arxiv" source.

        Args:
            config: Forwarded unchanged to ``InfoBase``. ``list_entries``
                treats a truthy config as the signal to write results to
                the store obtained from ``config.get_store()``.
        """
        super().__init__("arxiv", config)

    @staticmethod
    def _get_entry(content):
        """Parse an Atom feed returned by arXiv into a list of dictionaries.

        Args:
            content: The XML document (``str`` or ``bytes``) to parse.

        Returns:
            One dictionary per ``<entry>`` element. Every dictionary has the
            keys ``title``, ``authors`` (list of names), ``published``,
            ``doi``, ``category`` (term of the first category), ``path``
            (href of the ``text/html`` link) and ``pdf``; link-derived and
            category values are ``None`` when the feed does not provide
            them. The keys ``updated``, ``id``, ``abstract`` and ``comment``
            are only present when the corresponding element exists.

        Raises:
            xml.etree.ElementTree.ParseError: If ``content`` is not
                well-formed XML.
            AttributeError: If an entry lacks ``<title>`` or ``<published>``,
                or an author lacks ``<name>``.
        """
        root = ET.fromstring(content)

        # Namespace prefixes used in the find()/findall() paths below.
        namespaces = {
            "atom": "http://www.w3.org/2005/Atom",
            "arxiv": "http://arxiv.org/schemas/atom",
        }

        # Optional child elements, keyed by the name used in the result.
        optional_fields = {
            "updated": "atom:updated",
            "id": "atom:id",
            "abstract": "atom:summary",
            "comment": "arxiv:comment",
        }

        def first_href(links, attribute, value):
            """Return the href of the first link whose attribute equals value."""
            return next(
                (
                    link.attrib["href"]
                    for link in links
                    if link.attrib.get(attribute) == value
                ),
                None,
            )

        entries = []
        for entry in root.findall("atom:entry", namespaces):
            links = entry.findall("atom:link", namespaces)
            entry_data = {
                "title": entry.find("atom:title", namespaces).text,
                "authors": [
                    author.find("atom:name", namespaces).text
                    for author in entry.findall("atom:author", namespaces)
                ],
                "published": entry.find("atom:published", namespaces).text,
                "doi": first_href(links, "title", "doi"),
                "category": next(
                    (
                        category.attrib["term"]
                        for category in entry.findall("atom:category", namespaces)
                    ),
                    None,
                ),
                "path": first_href(links, "type", "text/html"),
                "pdf": first_href(links, "title", "pdf"),
            }

            for key, tag in optional_fields.items():
                node = entry.find(tag, namespaces)
                # Compare against None explicitly: an Element without child
                # elements is falsy, so a plain truth test would skip these
                # text-only elements even when they are present.
                if node is not None:
                    entry_data[key] = node.text

            entries.append(entry_data)

        return entries

    def list_entries(self, query, nentries=100, write_to_index=False):
        """Get a list of arxiv entries according to a query term, e.g. 'ANTARES'.

        The number of entries can be limited. If a config object was given
        to the talker, the results are written to the database instead of
        being returned.

        Args:
            query: Search term; sent as ``all:<query>``.
            nentries: Maximum number of results to request.
            write_to_index: Currently unused. Whether results are written
                is decided solely by the talker having a truthy ``config``.
                Kept so that existing callers keep working.

        Returns:
            The list of parsed entries when the talker has no config,
            otherwise ``None``.
        """
        params = {
            "search_query": "all:" + query,
            "start": 0,
            "max_results": nentries,
        }
        # NOTE(review): _get and self.response are not defined in this
        # class; presumably inherited from InfoBase - confirm.
        self._get("", params=params)

        # Parse the XML response
        entries = self._get_entry(self.response.content)

        if not self.config:
            return entries

        store = self.config.get_store()
        store.keepopen = True
        try:
            for entry in entries:
                self.write_to_index(entry)
        finally:
            # Close the store even if writing one of the entries fails.
            store._close()
        return None

    def write_to_index(self, entry):
        """Write one parsed arxiv entry to the database tables.

        The entry's metadata goes to the ``arxiv`` table and its abstract
        page / PDF links go to the ``stagingdocuments`` table; both rows are
        keyed by ``docid``.

        Args:
            entry: Dictionary as produced by ``_get_entry``.

        Raises:
            ValueError: If the entry has no ``path``, so that no document
                id can be derived.
        """
        path = entry.get("path")
        if not path:
            raise ValueError(
                "arxiv entry has no 'path' (abstract page URL); "
                "cannot derive a docid"
            )

        # The document id is the last segment of the abstract page URL.
        docid = "arxiv:" + path.rsplit("/", 1)[-1]

        arxiv_data = {"docid": docid}
        data_to_arxiv_keys = [
            "title", "authors", "published", "updated",
            "abstract", "comment", "doi", "category",
        ]
        for key in data_to_arxiv_keys:
            if key in entry:
                arxiv_data[key] = entry[key]

        link_data = {
            "docid": docid,
            "sourcename": "arxiv",
            "contenttype": "article",
            "doctype": "pdf",
            "url": path,
            "download": entry.get("pdf"),
        }

        # NOTE(review): list_entries obtains the store via
        # config.get_store(), while this method reads config.store -
        # confirm the attribute is set when this is called directly.
        self.config.store.add_data_to_table("arxiv", arxiv_data, ["docid"])
        self.config.store.add_data_to_table("stagingdocuments", link_data, ["docid"])

    def download(self, docid):
        """Function for download of a single entry to embed in the vector database.

        Not implemented yet: the method does nothing and returns ``None``.
        """
        pass