#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scrape the MCC/MNC operator tables from Wikipedia into ``mccmnc.json``.

The script downloads the "Mobile country code" article, finds every table
whose header row starts with an ``MCC`` column, and writes the collected
operators as a nested JSON object keyed first by MCC and then by MNC.
"""

import json
import re

import requests
from bs4 import BeautifulSoup

SOURCE_URL = 'https://en.wikipedia.org/wiki/Mobile_country_code'
OUTPUT_PATH = 'mccmnc.json'
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get may block forever

# Column titles every operator table is required to start with, in order.
EXPECTED_HEADER = [u'MCC', u'MNC', u'Brand', u'Operator', u'Status']
CITATION_MARK = '[citation needed]'

# The patterns end in \Z rather than $ because $ also matches just before a
# trailing newline, which would let a value such as "310\n" pass validation.
MCC_RE = re.compile(r'\d{3}\Z')
MNC_RE = re.compile(r'\d{2,3}\Z')
# Header cell whose only text is "MCC", tolerating surrounding whitespace.
MCC_HEADER_RE = re.compile(r'^\s*MCC\s*$')


def add_operator(mcc, mnc, brand, operator, status, country, country_code, db):
    """Store one operator record in *db* under ``db[mcc][mnc]``.

    Args:
        mcc: mobile country code; must be exactly three digits.
        mnc: mobile network code; must be two or three digits.
        brand, operator, status, country, country_code: values copied into
            the stored record unchanged.
        db: dict of dicts that is updated in place.

    Raises:
        ValueError: if *mcc* or *mnc* does not have the required form.
    """
    if not MCC_RE.match(mcc):
        raise ValueError('invalid MCC: %r' % (mcc,))
    if not MNC_RE.match(mnc):
        raise ValueError('invalid MNC: %r' % (mnc,))

    # A repeated (MCC, MNC) pair is not an error: the later record simply
    # replaces the earlier one.
    db.setdefault(mcc, {})[mnc] = {
        'brand': brand,
        'operator': operator,
        'country': country,
        'countryCode': country_code,
        'status': status,
    }


def scan_table(table, country, country_code, db):
    """Add every usable row of one operator *table* to *db*.

    Args:
        table: parsed ``<table>`` element (anything offering ``find_all``).
        country: country name attached to each record from this table.
        country_code: country code attached to each record from this table.
        db: dict of dicts that is updated in place via ``add_operator``.

    Raises:
        ValueError: if the first row does not start with the columns listed
            in ``EXPECTED_HEADER``, or a row carries a malformed MCC/MNC.
    """
    rows = table.find_all('tr')
    header_cells = rows[0].find_all('th') if rows else []
    header = [cell.text.strip()
              for cell in header_cells[:len(EXPECTED_HEADER)]]
    if header != EXPECTED_HEADER:
        raise ValueError('unexpected table header: %r' % (header,))

    for row in rows[1:]:
        cells = row.find_all('td')
        if len(cells) < len(EXPECTED_HEADER):
            # Not a full data row, so there is nothing to index into.
            continue

        mcc = cells[0].text.strip()
        mnc = cells[1].text.strip()
        brand = cells[2].text.replace(CITATION_MARK, '').strip()
        operator = cells[3].text.replace(CITATION_MARK, '').strip()
        status = cells[4].text.strip()

        # Skip rows with a missing code or an MNC marked as unknown ('?').
        if not mcc or not mnc or '?' in mnc:
            continue
        if '-' in mnc:
            # TODO: mnc range -- ranges are skipped until they are expanded.
            continue

        add_operator(mcc, mnc, brand, operator, status,
                     country, country_code, db)


def contains_headline(tag):
    """Return True if *tag* has a descendant with class ``mw-headline``."""
    return tag.find(class_='mw-headline') is not None


def main():
    """Download the article, parse all MCC tables and write ``mccmnc.json``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if a table has no preceding headline, the headline does
            not split into one or two parts, or a table fails validation.
    """
    response = requests.get(SOURCE_URL, timeout=REQUEST_TIMEOUT)
    # Fail loudly instead of scraping an HTTP error page.
    response.raise_for_status()
    # The article is HTML, so it is parsed with an HTML parser, not 'xml'.
    soup = BeautifulSoup(response.text, 'html.parser')

    db = {}
    for th in soup.find_all('th', string=MCC_HEADER_RE):
        table = th.find_parent('table')
        heading = table.find_previous_sibling(contains_headline)
        if heading is None:
            raise ValueError('no section headline found before an MCC table')

        # The headline reads either "<country>" or "<country> - <code>".
        title = heading.find(class_='mw-headline').get_text()
        parts = title.split(' - ')
        if len(parts) not in (1, 2):
            raise ValueError('unexpected section headline: %r' % (title,))
        country = parts[0]
        country_code = ''.join(parts[1:])

        scan_table(table, country, country_code, db)

    with open(OUTPUT_PATH, 'w') as f:
        json.dump(db, f, indent=4, sort_keys=True)


if __name__ == '__main__':
    main()