#!/usr/bin/env python3
import json
import pathlib
import re
import typing

import bs4
import requests

BIRTHDAYS_JSON = pathlib.Path('mco-birthdays.json')
USERS_URL_TEMPLATE = 'https://minecraftonline.com/w/index.php?title=Special:ListUsers&limit={limit}&offset={offset}'
SAVE_CHUNK_SIZE = 10

# Match "birthdayyear = ...", "birthdaymonth = ...", "birthdayday = ..." template
# parameters in raw wiki markup; values run up to the next "|" separator.
BIRTHDAY_REGEXES = {
    part: re.compile(rf'birthday{part}\s*=\s*([^|]+)')
    for part in ('year', 'month', 'day')
}
USER_REGEX = re.compile('User:([^&]+)')
OFFSET_REGEX = re.compile('offset=([^&]+)')

T_Birthday = dict[str, int]

MONTHS = (
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december'
)


def find_month(query: str) -> typing.Optional[int]:
    """Return the 1-based month number whose name starts with `query`, if any."""
    query = query.lower()
    if not query:
        return None
    # Linear prefix scan: very inefficient but quick to write.
    for i, month in enumerate(MONTHS):
        if month.startswith(query):
            return i + 1
    return None


def extract_birthday(source: typing.Union[str, bytes]) -> T_Birthday:
    """Extract any birthday year/month/day fields from raw wiki page source."""
    result: T_Birthday = {}
    if isinstance(source, bytes):
        source = source.decode()
    for part, regex in BIRTHDAY_REGEXES.items():
        if match := regex.search(source):
            match_result = match.group(1).strip()
            if match_result.isnumeric():
                result[part] = int(match_result)
            elif month := find_month(match_result):
                # Month given as a name (e.g. "March") rather than a number.
                result[part] = month
    return result


def retrieve_birthday(username: str) -> typing.Optional[T_Birthday]:
    """Fetch a user page's wiki source and extract its birthday fields.

    Returns None if the page could not be retrieved.
    """
    url = f'https://minecraftonline.com/w/index.php?title=User:{username}&action=edit'
    print(f'retrieving {url}')
    response = requests.get(url)
    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        return None
    return extract_birthday(response.content)


def retrieve_users_with_pages(limit_per_page: int = 500) -> list[str]:
    """Walk Special:ListUsers and collect every username whose user page exists."""
    offset = ''
    results = []
    while True:
        url = USERS_URL_TEMPLATE.format(limit=limit_per_page, offset=offset)
        print(f'retrieving {url}')
        response = requests.get(url)
        response.raise_for_status()
        soup = bs4.BeautifulSoup(response.content, 'lxml')
        # Links without the "new" class point at user pages that actually exist.
        results += [
            USER_REGEX.search(a['href']).group(1)
            for a in soup.select('a.mw-userlink:not(.new)')
        ]
        next_link = soup.select_one('.mw-nextlink')
        if next_link:
            offset = OFFSET_REGEX.search(next_link['href']).group(1)
        else:
            break
    return results


def main():
    # Resume from any previously saved results so reruns skip finished users.
    birthdays = {}
    if BIRTHDAYS_JSON.is_file():
        with BIRTHDAYS_JSON.open('r') as fp:
            birthdays = json.load(fp)

    users_with_pages = retrieve_users_with_pages()  # about 40 requests
    users_to_retrieve = sorted(set(users_with_pages) - set(birthdays.keys()))
    print(f'{len(users_to_retrieve)} birthdays to attempt to retrieve ({len(birthdays)} already saved)')

    for i, username in enumerate(users_to_retrieve):
        result = retrieve_birthday(username)
        if result is None:
            # HTTP error: record nothing, so this user is retried on the next run.
            continue
        if result:
            print(result)
        # Store even empty results so pages without birthdays aren't re-fetched.
        birthdays[username] = result
        # Save periodically so progress survives an interruption.
        if i % SAVE_CHUNK_SIZE == 0:
            with BIRTHDAYS_JSON.open('w') as fp:
                json.dump(birthdays, fp, indent=2)

    with BIRTHDAYS_JSON.open('w') as fp:
        json.dump(birthdays, fp, indent=2)


if __name__ == '__main__':
    main()