from llmtuner.infogetter.infobase import InfoBase
class WikiTalker(InfoBase):
    """Retrieve entry information from a MediaWiki instance.

    Requests are issued through the inherited ``_get`` helper, whose result
    is read back from ``self.response``; storage goes through
    ``self.config.store``.
    """

    def __init__(self, config=""):
        """Register this info getter under the source name ``"wiki"``.

        Args:
            config: Configuration handed unchanged to ``InfoBase.__init__``.
        """
        super().__init__("wiki", config)

    def list_pages(self, queryparams=None, get_all_pages=False,
                   get_extended_params=True, write_to_db=False):
        """Fetch a page listing from MediaWiki.

        Args:
            queryparams: Dictionary of query parameters sent with the listing
                request. The response is expected to carry its results under
                ``["query"]["allpages"]``. The dictionary is copied, so the
                caller's object is never modified. ``None`` (the default)
                means "no extra parameters".
            get_all_pages: If True, follow the API's ``continue`` tokens until
                the listing is exhausted (requesting ``aplimit=max`` unless
                the caller set ``aplimit``). If False (default), only the
                first result page is returned.
            get_extended_params: If True (default), look up ``prop=info``
                details (URL, display title, protection, ...) for every
                listed page and return those records instead of the bare
                listing entries.
            write_to_db: If True (default False), pass every returned entry
                to :meth:`write_to_index`.

        Returns:
            A list of page dictionaries: the extended info records when
            ``get_extended_params`` is True, otherwise the raw listing
            entries. If a listing request fails, whatever was collected up
            to that point is returned (possibly an empty list).
        """
        # Never mutate the caller's dict (or a shared default): continuation
        # tokens written here must not leak into later calls.
        params = dict(queryparams) if queryparams else {}
        if get_all_pages:
            params.setdefault("aplimit", "max")

        pages_list = []
        while True:
            self._get("", "", params, True)
            if not self.response.ok:
                # Stop paging on a failed request; keep what we have so far.
                break
            data = self.response.json()  # parse the body only once
            pages_list.extend(data["query"]["allpages"])
            if not get_all_pages or "continue" not in data:
                break
            # Merge the complete continuation object into the next request,
            # as the MediaWiki API documentation prescribes.
            params.update(data["continue"])

        if get_extended_params:
            # Return extended parameters in bunches of 50, which is the
            # maximum number of page ids accepted per request.
            batchsize = 50
            pages_extended = []
            for start in range(0, len(pages_list), batchsize):
                batch = pages_list[start:start + batchsize]
                detail_params = {
                    "action": "query",
                    "format": "json",
                    "pageids": "|".join(str(page["pageid"]) for page in batch),
                    "prop": "info",  # Retrieve page info
                    # Extended properties
                    "inprop": "protection|talkid|watched|subjectid|url|displaytitle|preload|varianttitles",
                }
                self._get("", "", detail_params, True)
                # Best effort: a failed batch is skipped, the others are kept.
                if self.response.ok:
                    pages_extended.extend(self.response.json()["query"]["pages"].values())
        else:
            pages_extended = pages_list

        if write_to_db:
            for entry in pages_extended:
                self.write_to_index(entry)
        return pages_extended

    def write_to_index(self, entry, update=True):
        """Store one page record in the ``documentindex`` and ``wiki`` tables.

        Args:
            entry: Page dictionary. It must contain ``pageid`` and
                ``fullurl``; ``title``, ``touched`` and ``length`` are read
                as well. Records returned by :meth:`list_pages` with
                ``get_extended_params=True`` are the intended input.
            update: Forwarded to ``add_data_to_table`` as its ``update``
                keyword for both tables.

        Returns:
            None. An entry lacking ``pageid`` or ``fullurl`` is reported on
            stdout and skipped without touching the store.
        """
        if "pageid" not in entry or "fullurl" not in entry:
            print("Cannot write entry without pageid and fullurl. Retrieve extended parameters from Wiki to the this information.")
            # Nothing usable to store; bail out instead of raising KeyError
            # on the lookups below.
            return

        docid = "wiki:" + str(entry["pageid"])
        docindex_params = {
            "url": entry["fullurl"],
            "docid": docid,
            "sourceid": "wiki",
            "sourceformat": "text/html",
            "download": "webpage",
        }
        wiki_params = {
            "docid": docid,
            "title": entry["title"],
            "lastupdateDate": entry["touched"],
            "length": entry["length"],
        }
        self.config.store.add_data_to_table("documentindex", docindex_params, ["docid"], update=update)
        self.config.store.add_data_to_table("wiki", wiki_params, ["docid"], update=update)