import urllib2
import json
import re


class SoccerwayTeamMatches:

    def __init__(self, teamId):
        self.teamId = str(teamId)
        self.data = {'all': [], 'home': [], 'away': []}

    def parseJson(self, jsonStr):
        """
        Handles the parsing of the JSON object returned by the Soccerway API
        (nr.soccerway.com/a/block_team_matches).
        Input:  The JSON object returned by the API as a string
        Return: Two-dimensional array of the match results
        """
        jsonPy = json.loads(jsonStr)

        # Fetch the interesting part of the inputted JSON obj: an HTML
        # table fragment with one row per match
        content = jsonPy['commands'][0]['parameters']['content']

        # Remove uninteresting header and footer data, keeping only the
        # table body between the <tbody> and </tbody> tags
        cleanContent = content.split('<tbody>', 1)[1].split('</tbody>', 1)[0]

        # Split content by <tr>-tags (tr is shorthand for table row)
        p1 = re.compile(r'<tr[^>]*>')
        splitted = p1.split(cleanContent)
        header = splitted[1]    # First row is the table header data
        data = splitted[2:-1]   # Rest are the match info

        # Split content by <td>-tags (table columns) and clean other tags
        p2 = re.compile(r'<td[^>]*>')
        f = lambda x: map(lambda y: re.sub('<[^<]+?>', '', y).strip(),
                          p2.split(x)[1:-2])
        return map(f, data)

    def getData(self, matchType):
        """
        Return the cleaned match data as a 2D array. Does simple caching of
        the GET queries, that is, the same data is not queried twice.
        Input:  Type of the matches, must be one of the following strings:
                'all', 'away' or 'home'.
        Output: 2D array of match results
        """
        if matchType not in ['all', 'away', 'home']:
            return []
        if not self.data[matchType]:
            # The callback_params and params query arguments are URL-encoded
            # JSON objects; only team_id and the match filter vary
            url = "http://nr.soccerway.com/a/block_team_matches" \
                  "?block_id=page_team_1_block_team_matches_5" \
                  "&callback_params=%7B%22page%22%3A0%2C%22" \
                  "bookmaker_urls%22%3A%5B%5D%2C%22block_service_id" \
                  "%22%3A%22team_matches_block_teammatches%22%2C%22" \
                  "team_id%22%3A" + self.teamId + "%2C%22competition_id" \
                  "%22%3A0%2C%22filter%22%3A%22all%22%7D" \
                  "&action=filterMatches&params=%7B%22" \
                  "filter%22%3A%22" + matchType + "%22%7D"
            jsonStr = urllib2.urlopen(url).read()
            self.data[matchType] = self.parseJson(jsonStr)
        return self.data[matchType]
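
# Example usage: a minimal sketch, assuming nr.soccerway.com is reachable
# from this machine. The team id below is a hypothetical placeholder, not a
# value taken from the code above; substitute a real Soccerway team id.
if __name__ == '__main__':
    team = SoccerwayTeamMatches(961)     # 961 is a made-up team id
    homeMatches = team.getData('home')   # triggers the HTTP GET and the parse
    for match in homeMatches:
        # Each row is a list of cleaned table-cell strings
        print ' | '.join(match)
    # A second call with the same match type is served from the in-memory cache
    cachedMatches = team.getData('home')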