Source code for llmtuner.infogetter.wiki

from llmtuner.infogetter.infobase import InfoBase

class WikiTalker(InfoBase):
    """Class to retrieve entry information from MediaWiki."""

    def __init__(self, config=""):
        """Register this talker under the source name ``"wiki"``.

        Args:
            config: Forwarded unchanged to ``InfoBase.__init__``.
        """
        super().__init__("wiki", config)

    def list_pages(self, queryparams=None, get_all_pages=False,
                   get_extended_params=True, write_to_db=False):
        """Fetch a list of pages from MediaWiki.

        Args:
            queryparams: Optional dictionary of query parameters handed to
                ``self._get``. The dictionary is copied, so the caller's
                object is never modified.
            get_all_pages: If True (default False), follow the API's
                ``continue`` information until no further result page is
                announced instead of returning only the first result page.
                ``aplimit`` is set to ``"max"`` unless already supplied.
            get_extended_params: If True (default), additionally query
                ``prop=info`` for the listed page ids and return those
                records instead of the plain ``allpages`` entries.
            write_to_db: If True (default False), pass every returned entry
                to ``write_to_index``.

        Returns:
            A list of page dictionaries: the ``prop=info`` records when
            ``get_extended_params`` is True, otherwise the raw ``allpages``
            entries. A request whose response is not ``ok`` contributes no
            entries; paging stops at the first such response.
        """
        # Work on a private copy: the paging below writes continuation keys
        # into the parameters, which must not leak into the caller's dict or
        # into later calls.
        params = dict(queryparams) if queryparams else {}
        if get_all_pages:
            params.setdefault("aplimit", "max")

        pages_list = []
        # NOTE(review): ``_get`` is inherited and not visible here; it is
        # assumed to store its result in ``self.response`` - confirm.
        self._get("", "", params, True)
        while self.response.ok:
            payload = self.response.json()
            pages_list.extend(payload["query"]["allpages"])
            continuation = payload.get("continue")
            if not get_all_pages or not continuation:
                break
            # Merge the complete continuation block into the next request
            # rather than picking out a single key.
            params.update(continuation)
            self._get("", "", params, True)

        if get_extended_params:
            # return extended parameters in bunches of 50 which is the
            # maximum number
            batchsize = 50
            pages_extended = []
            for start in range(0, len(pages_list), batchsize):
                # Get the current batch of page IDs
                batch = pages_list[start:start + batchsize]
                page_ids = "|".join(str(page["pageid"]) for page in batch)
                # Parameters for fetching detailed information
                detail_params = {
                    "action": "query",
                    "format": "json",
                    "pageids": page_ids,
                    "prop": "info",  # Retrieve page info
                    # Extended properties
                    "inprop": "protection|talkid|watched|subjectid|url|displaytitle|preload|varianttitles",
                }
                self._get("", "", detail_params, True)
                # A failed batch is skipped; its pages are missing from the
                # result.
                if self.response.ok:
                    pages_extended.extend(
                        self.response.json()["query"]["pages"].values()
                    )
        else:
            pages_extended = pages_list

        if write_to_db:
            for entry in pages_extended:
                self.write_to_index(entry)
        return pages_extended

    def write_to_index(self, entry, update=True):
        """Store information to DB table.

        Writes one row to the ``documentindex`` table and one row to the
        ``wiki`` table through ``self.config.store.add_data_to_table``, both
        using ``docid`` as the key column list.

        Args:
            entry: Page dictionary. Must contain ``pageid`` and ``fullurl``;
                ``title``, ``touched`` and ``length`` are read as well.
            update: Forwarded as ``update`` to ``add_data_to_table``.

        Entries lacking ``pageid`` or ``fullurl`` are reported on stdout and
        skipped without writing anything.
        """
        if "pageid" not in entry or "fullurl" not in entry:
            print ("Cannot write entry without pageid and fullurl. Retrieve extended parameters from Wiki to the this information.")
            # Nothing usable to store; continuing would fail on the missing
            # key below.
            return

        docid = "wiki:" + str(entry["pageid"])
        docindex_params = {
            "url": entry["fullurl"],
            "docid": docid,
            "sourceid": "wiki",
            "sourceformat": "text/html",
            "download": "webpage",
        }
        wiki_params = {
            "docid": docid,
            "title": entry["title"],
            "lastupdateDate": entry["touched"],
            "length": entry["length"],
        }
        self.config.store.add_data_to_table("documentindex", docindex_params, ["docid"], update=update)
        self.config.store.add_data_to_table("wiki", wiki_params, ["docid"], update=update)