Last active: March 15, 2020 05:42
Revisions
jskDr renamed this gist
Mar 15, 2020. 1 changed file with 52 additions and 1 deletion.
@@ -228,6 +228,57 @@ def play(self, P_no):
        S = np.zeros((N_A,), dtype='int16')  # #state == #action
        if self.disp_flag:
            print('S:', S)

        done = False
        while done == False:
            action, done = self.get_action(P_no, S)
            Buff['P_no'].append(P_no)
            Buff['S'].append(S.copy())
            Buff['a'].append(action)
            set_state_inplace(S, action, P_no)
            Buff['S_next'].append(S.copy())
            if self.disp_flag:
                print('S:', S)
            win_player = calc_reward_tf(S)
            reward = 0 if win_player == 0 else 1
            Buff['r'].append(reward)
            P_no = 1 if P_no == 2 else 2
            if win_player:
                done = True

        if self.disp_flag:
            if win_player:
                print(f'player {win_player} win')
            else:
                print(f'Tie game')

        return Buff

    def play_with_random(self, P_no):
        """
        Buff = play_with_random(self, P_no)

        [Inputs]
        P_no: player number, which is 1 or 2

        [Returns]
        Buff = {'P_no': [], 'S':[], 'a':[], 'r':[], 'S_next': []}: gathered information during learning,
        where S, a, r, S_next are state, action, reward, and next state

        [Examples]
        1. Buff = self.play_with_random(1)
        2. Buff = self.play_with_random(2)
        """
        N_A = self.N_A
        Buff = {'P_no': [], 'S':[], 'a':[], 'r':[], 'S_next': []}
        S = np.zeros((N_A,), dtype='int16')  # #state == #action
        if self.disp_flag:
            print('S:', S)

@@ -561,7 +612,7 @@ def plot_cnt_trace(cnt_trace):
    plt.ylabel('Count')
    plt.legend(loc=0)
    plt.title('Learned (P#1) vs. Random (P#2) policies during learning')
    plt.show()

def learning_stage(N_episodes=100, save_flag=True, fig_flag=False):
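In this revision, play() calls get_action(), so both players follow the learned Q-tables, while the newly added play_with_random() presumably keeps the earlier behavior of a learned player 1 against a random player 2 (its body is truncated in the diff above). A minimal sketch of how the two episode generators might be compared after training; compare_policies, agent, and n_games are illustrative names, not part of the gist, and the winner check copies the one used in Q_System.learning():

def compare_policies(agent, n_games=100):
    # Hypothetical helper: agent is assumed to be a trained Q_System instance.
    # Counts how often player 1 wins under self-play vs. against a random player 2.
    wins = {'self_play': 0, 'vs_random': 0}
    for _ in range(n_games):
        buff = agent.play(1)              # both players use the learned Q-tables
        if buff['r'][-1] != 0 and buff['P_no'][-1] == 1:
            wins['self_play'] += 1
        buff = agent.play_with_random(1)  # player 1 learned, player 2 random
        if buff['r'][-1] != 0 and buff['P_no'][-1] == 1:
            wins['vs_random'] += 1
    return wins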
jskDr renamed this gist
Mar 15, 2020. 1 changed file with 0 additions and 0 deletions.
File renamed without changes.
jskDr created this gist
Mar 15, 2020.
@@ -0,0 +1,622 @@
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pickle

# The TicTacToe game has nine positions with nine actions. A user can put a stone on any position on the board except the occupied ones.

def set_state_inplace(S, action, P_no):
    ''' S is a numpy array.'''
    assert S[action] == 0, 'position should be empty to put a new stone'
    S[action] = P_no  # Use numpy to insert the action at the specific position

def calc_reward(S):
    mask_l = tf.constant([[1,1,1,0,0,0,0,0,0],
                          [0,0,0,1,1,1,0,0,0],
                          [0,0,0,0,0,0,1,1,1],
                          [1,0,0,1,0,0,1,0,0],
                          [0,1,0,0,1,0,0,1,0],
                          [0,0,1,0,0,1,0,0,1],
                          [1,0,0,0,1,0,0,0,1],
                          [0,0,1,0,1,0,1,0,0]], dtype=tf.int16)

    for mask in mask_l:
        # print(mask)
        mask_S = mask * S
        # print(mask_S)
        for player in [1,2]:
            abs_err = tf.reduce_sum(tf.abs(mask_S - player * mask))
            # print(abs_err)
            if abs_err == 0:
                # print(f'Player{player} wins')
                return player
    return 0

def calc_reward_tf(S):
    mask_l = tf.constant([[1,1,1,0,0,0,0,0,0],
                          [0,0,0,1,1,1,0,0,0],
                          [0,0,0,0,0,0,1,1,1],
                          [1,0,0,1,0,0,1,0,0],
                          [0,1,0,0,1,0,0,1,0],
                          [0,0,1,0,0,1,0,0,1],
                          [1,0,0,0,1,0,0,0,1],
                          [0,0,1,0,1,0,1,0,0]], dtype=tf.int32)
    S = tf.constant(S, dtype=tf.int32)
    S = tf.reshape(S, shape=(1,-1))
    S_cp = tf.matmul(tf.ones((mask_l.shape[0],1), dtype=tf.int32), S)
    mask_S = mask_l * S_cp
    for player in [1, 2]:
        if tf.reduce_any(tf.reduce_sum(tf.abs(mask_S - player * mask_l), axis=1) == 0):
            return player
    return 0

def one_of_amax(arr, disp_flag=False):
    results = np.where(arr == np.amax(arr))[0]
    if disp_flag:
        print('Equally max actions:', results)
    action = results[np.random.randint(0, len(results), 1)[0]]
    return action

class Q_System:
    def __init__(self, N_A=9, N_Symbols=3, epsilon=0.01, disp_flag=False):
        """
        N_A : Number of actions
        N_Symbols : Number of possible symbols in each point: 0, 1, 2, representing empty, player1, player2
        N_S : Number of states
        """
        if N_A is not None:
            self.disp_flag = disp_flag
            N_S = N_Symbols**N_A
            self.Qsa = [np.zeros((N_S,N_A)), np.zeros((N_S,N_A))]
            self.N_A = N_A
            self.N_Symbols = N_Symbols
            self.epsilon = epsilon
        else:
            self.disp_flag = False
            self.Qsa = None
            self.N_A = None
            self.N_Symbols = None
            self.epsilon = None

    def save(self):
        f = open('tictactoe_data.pckl', 'wb')
        obj = [self.N_A, self.N_Symbols, self.epsilon, self.Qsa]
        pickle.dump(obj, f)
        f.close()

    def load(self):
        f = open('tictactoe_data.pckl', 'rb')
        obj = pickle.load(f)
        [self.N_A, self.N_Symbols, self.epsilon, self.Qsa] = obj
        f.close()

    def calc_S_idx(self, S):
        S_idx = 0
        unit = 1
        for s in S:
            S_idx += s*unit
            unit *= self.N_Symbols
        return S_idx

    def _policy_random(self, S, a):
        return 1 / self.N_A

    def policy_random(self, P_no, S, action_list):
        action_prob = []
        S_idx = self.calc_S_idx(S)
        for _ in action_list:
            action_prob.append(1/len(action_list))

        action_idx = tf.squeeze(tf.random.categorical(tf.math.log([action_prob]),1)).numpy()
        if action_idx == len(action_prob):  # if all zeros in action_prob
            action = action_list[tf.squeeze(np.random.randint(0, len(action_list), 1))]
        else:
            action = action_list[action_idx]

        if self.disp_flag:
            print('S_idx', S_idx, 'action', action, 'action_list', action_list, 'action_prob', action_prob)

        return action

    def policy(self, P_no, S, action_list):
        action_prob = []
        S_idx = self.calc_S_idx(S)
        for a in action_list:
            action_prob.append(self.Qsa[P_no-1][S_idx,a])

        # We consider max Q with epsilon greedy
        if tf.squeeze(tf.random.uniform([1,1])) > self.epsilon:
            action = action_list[one_of_amax(action_prob)]
        else:
            action = action_list[np.random.randint(0,len(action_list),1)[0]]

        if self.disp_flag:
            print('S_idx', S_idx, 'action', action, 'action_list', action_list, 'action_prob', action_prob)

        return action

    def _r0_policy(self, P_no, S, action_list):
        action_prob = []
        S_idx = self.calc_S_idx(S)
        for a in action_list:
            action_prob.append(self.Qsa[P_no-1][S_idx, a])

        action_idx = tf.squeeze(tf.random.categorical(tf.math.log([action_prob]),1)).numpy()
        if action_idx == len(action_prob):  # if all zeros in action_prob
            action = action_list[tf.squeeze(np.random.randint(0, len(action_list), 1))]
        else:
            action = action_list[action_idx]

        if self.disp_flag:
            print('S_idx', S_idx, 'action', action, 'action_list', action_list, 'action_prob', action_prob)

        return action

    def find_action_list(self, S):
        action_list = []
        no_occupied = 0
        for a in range(self.N_A):
            if S[a] == 0:
                action_list.append(a)
            else:
                no_occupied += 1
        return action_list, no_occupied

    # Take action_prob at the given state
    def get_action(self, P_no, S):
        """Return action, done
        """
        action_list, no_occupied = self.find_action_list(S)
        # Since the number of possible actions is reduced,
        # the denominator is also updated.
        action = self.policy(P_no, S, action_list)
        done = no_occupied == (self.N_A - 1)
        return action, done

    def get_action_with_random(self, P_no, S):
        """Return action, done
        """
        action_list, no_occupied = self.find_action_list(S)
        # Since the number of possible actions is reduced,
        # the denominator is also updated.
        if P_no == 1:
            action = self.policy(P_no, S, action_list)
        else:
            action = self.policy_random(P_no, S, action_list)
        done = no_occupied == (self.N_A - 1)
        return action, done

    def get_action_with_human(self, P_no, S):
        """
        action, done = get_action_with_human(self, P_no, S)
        - Playing with a human

        [Inputs]
        P_no : Human player index, which represents 1=first, 2=second playing
        """
        action_list, no_occupied = self.find_action_list(S)
        # Since the number of possible actions is reduced,
        # the denominator is also updated.
        print('The current game state is:')
        print(S.reshape(3,3))
        print('Action index:')
        print(np.array(range(9)).reshape(3,3))
        print('Available actions: ', action_list)
        rand_idx = np.random.randint(0, len(action_list))
        random_action = action_list[int(rand_idx)]
        action = None
        while action not in action_list:
            action = input_default(f'Type your action (default={random_action}): ', random_action, int)
            if action not in action_list:
                print('Type your action again from the available action list:', action_list)
        done = no_occupied == (self.N_A - 1)
        return action, done

    def play(self, P_no):
        """
        Buff = play(self, P_no)

        [Inputs]
        P_no: player number, which is 1 or 2

        [Returns]
        Buff = {'P_no': [], 'S':[], 'a':[], 'r':[], 'S_next': []}: gathered information during learning,
        where S, a, r, S_next are state, action, reward, and next state

        [Examples]
        1. Buff = self.play(1)
        2. Buff = self.play(2)
        """
        N_A = self.N_A
        Buff = {'P_no': [], 'S':[], 'a':[], 'r':[], 'S_next': []}
        S = np.zeros((N_A,), dtype='int16')  # #state == #action
        if self.disp_flag:
            print('S:', S)

        done = False
        while done == False:
            action, done = self.get_action_with_random(P_no, S)
            Buff['P_no'].append(P_no)
            Buff['S'].append(S.copy())
            Buff['a'].append(action)
            set_state_inplace(S, action, P_no)
            Buff['S_next'].append(S.copy())
            if self.disp_flag:
                print('S:', S)
            win_player = calc_reward_tf(S)
            reward = 0 if win_player == 0 else 1
            Buff['r'].append(reward)
            P_no = 1 if P_no == 2 else 2
            if win_player:
                done = True

        if self.disp_flag:
            if win_player:
                print(f'player {win_player} win')
            else:
                print(f'Tie game')

        return Buff

    def play_with_human(self, player_human=1):
        """
        Buff = play_with_human(self, player_human=1)
        - Playing with a human

        [Inputs]
        player_human: human player number, which is 1 or 2

        [Returns]
        Buff = {'P_no': [], 'S':[], 'a':[], 'r':[], 'S_next': []}: gathered information during learning,
        where S, a, r, S_next are state, action, reward, and next state

        [Examples]
        1. Buff = self.play_with_human(1)
        2. Buff = self.play_with_human(2)
        """
        N_A = self.N_A
        Buff = {'P_no': [], 'S':[], 'a':[], 'r':[], 'S_next': []}
        S = np.zeros((N_A,), dtype='int16')  # #state == #action
        if self.disp_flag:
            print('S:', S)

        P_no = 1  # set P_no = 1 while player_human could be 1 or 2
        done = False
        while done == False:
            if player_human == P_no:
                action, done = self.get_action_with_human(P_no, S)
            else:
                P_no_trained_agent = 1  # random agent is 2
                action, done = self.get_action_with_random(P_no_trained_agent, S)
            Buff['P_no'].append(P_no)
            Buff['S'].append(S.copy())
            Buff['a'].append(action)
            set_state_inplace(S, action, P_no)
            Buff['S_next'].append(S.copy())
            if self.disp_flag:
                print('S:', S)
            win_player = calc_reward_tf(S)
            reward = 0 if win_player == 0 else 1
            Buff['r'].append(reward)
            P_no = 1 if P_no == 2 else 2
            if win_player:
                done = True

        print(S.reshape(3,3))
        if win_player == player_human:
            print('You win')
        elif win_player != 0:
            print('You lose')
        else:
            print('Tie game')

        return Buff

    def update_Qsa_inplace(self, Buff, ff=0.9, lr=0.01):
        def discounted_inplace(Buff_r):
            """discounted_inplace(Buff_r):
            Convert a reward vector to a discounted return vector using ff,
            where ff means forgetting factor.

            [Input]
            Buff_r = Buff['r']: stores rewards in each episode
            """
            g_prev = 0
            for i, r_l in enumerate(reversed(Buff_r)):
                Buff_r[-i-1] = r_l + ff * g_prev
                g_prev = Buff_r[-i-1]

        def updateQsa_inplace(Qsa_player, Buff_player):
            if self.disp_flag:
                print('---------------------------------------')
                print('S, S_idx, a, lr * r, Qsa_player[S_idx,a]')
            for S, a, r in zip(Buff_player['S'], Buff_player['a'], Buff_player['r']):
                S_idx = self.calc_S_idx(S)
                Qsa_player[S_idx,a] += lr * r
                if self.disp_flag:
                    print(S, S_idx, a, lr * r, Qsa_player[S_idx,a])

        # def updateQsa_stages_inplace(player, Qsa_player, Buff_player, disp_flag=True):

        def buff_depart(Buff):
            Buff_dual = [{'S':[], 'a':[], 'r':[]}, {'S':[], 'a':[], 'r':[]}]
            for i, (p, S, a, r) in enumerate(zip(Buff['P_no'], Buff['S'], Buff['a'], Buff['r'])):
                if i > 0:
                    # the final reward for a player is the reward of the next player
                    prev_p = 2 if p==1 else 1
                    Buff_dual[prev_p-1]['r'][-1] = -r  # 1 for player#2 --> -1 for player#1, vice versa
                    if self.disp_flag:
                        print('i, prev_p, Buff_dual[prev_p-1]')
                        print(i, prev_p, Buff_dual[prev_p-1])
                Buff_dual[p-1]['S'].append(S)
                Buff_dual[p-1]['a'].append(a)
                Buff_dual[p-1]['r'].append(r)
                if self.disp_flag:
                    print('i, p, Buff_dual[p-1]')
                    print(i, p, Buff_dual[p-1])
            return Buff_dual

        Buff_dual = buff_depart(Buff)

        # player#1
        for player in [1,2]:
            discounted_inplace(Buff_dual[player-1]['r'])  # for player#1
            if self.disp_flag:
                print('player:', player)
                print("Buff_dual[player-1]['r']", Buff_dual[player-1]['r'])
            updateQsa_inplace(self.Qsa[player-1], Buff_dual[player-1])
            # updateQsa_stages_inplace(player, self.Qsa_stages[player-1], Buff_dual[player-1])

    def learning(self, N_episodes=2, ff=0.9, lr=0.01, print_cnt=10):
        """Return: cnt_trace = [cnt, ...]: cnt vectors are stacked in cnt_trace
        """
        cnt = [0, 0, 0]  # tie, p1, p2
        cnt_trace = [cnt.copy()]
        player = 1
        for episode in range(N_episodes):
            # print('===================================')
            # Can save this data for play 2 as well

            # Decrease epsilon with respect to episode
            self.epsilon = 1 / (1 + episode/100)
            # self.epsilon = 1 / (1 + episode)

            Buff = self.play(player)
            self.update_Qsa_inplace(Buff, ff=ff, lr=lr)

            win_player = 0 if Buff['r'][-1] == 0 else Buff['P_no'][-1]
            cnt[win_player] += 1
            cnt_trace.append(cnt.copy())

            player = 2 if player == 1 else 1
            if episode % print_cnt == 0:
                print(episode, cnt)
                print('S = [0,0,0, 0,0,0, 0,0,0]')
                print('Qsa[0][0,:]', [f'{self.Qsa[0][0,a]:.1e}' for a in range(9)])
                print('Qsa[1][0,:]', [f'{self.Qsa[1][0,a]:.1e}' for a in range(9)])

                S = [1,1,0, 2,1,2, 1,2,2]
                S_idx = self.calc_S_idx(S)
                print('S = ', S)
                print(f'Qsa[0][{S_idx},:]', [f'{self.Qsa[0][S_idx,a]:.1e}' for a in range(9)])
                print(f'Qsa[1][{S_idx},:]', [f'{self.Qsa[1][S_idx,a]:.1e}' for a in range(9)])

                S = [1,1,0, 2,0,0, 2,0,0]
                S_idx = self.calc_S_idx(S)
                print('S = ', S)
                print(f'Qsa[0][{S_idx},:]', [f'{self.Qsa[0][S_idx,a]:.1e}' for a in range(9)])
                print(f'Qsa[1][{S_idx},:]', [f'{self.Qsa[1][S_idx,a]:.1e}' for a in range(9)])

        return cnt_trace


class Testing:
    def __init__(self, fn_name):
        '''Usages:
        - Testing('calc_reward_tf')
        '''
        if fn_name == 'calc_reward_tf':
            self.test_calc_reward_tf()
        elif fn_name == 'find_action_list':
            self.test_find_action_list()
        elif fn_name == 'get_action':
            self.test_get_action()
        elif fn_name == 'all':
            self.test_calc_reward_tf()
            self.test_find_action_list()
            self.test_get_action()

    def test_calc_reward_tf(self):
        S_examples = tf.constant([[0,0,0, 0,0,0, 0,0,0],
                                  [1,1,1, 2,0,2, 2,0,0],
                                  [0,0,2, 1,2,1, 2,0,0]])
        print('===================================')
        print('Testing: calc_reward_tf')
        print('[Answer]')
        answer = [0, 1, 2]
        print(answer)
        print('-------------------------------------')
        print('[Test]')
        test = [calc_reward_tf(S) for S in S_examples]
        print(test)
        if test == answer:
            print('Test Ok')
        else:
            print('Test fail')

    def test_find_action_list(self):
        print('===================================')
        print('Testing: test_find_action_list')
        print('[Answer]')
        print('''[[0 0 0]
 [0 0 0]
 [0 0 0]] [0, 1, 2, 3, 4, 5, 6, 7, 8] 0
[[0 2 0]
 [0 1 0]
 [1 0 2]] [0, 2, 3, 5, 7] 4''')
        N_A = 9
        N_Symbols = 3
        my_Q_System = Q_System(N_A, N_Symbols)
        print('-------------------------------------')
        print('[Test]')
        S_l = [[0,0,0, 0,0,0, 0,0,0],
               [0,2,0, 0,1,0, 1,0,2]]
        for S in S_l:
            action_list, no_occupied = my_Q_System.find_action_list(S)
            print(np.reshape(S,(3,3)), action_list, no_occupied)

    def test_get_action(self):
        print('===================================')
        print('Testing: get_action')
        print('''[Answer]
Equally max actions: [0]
S_idx 0 action 0 action_list [0, 1, 2, 3, 4, 5, 6, 7, 8] action_prob [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[[0 0 0]
 [0 0 0]
 [0 0 0]] 1 0
Equally max actions: [0]
S_idx 13950 action 0 action_list [0, 1, 3, 5, 7] action_prob [1.0, 0.0, 0.0, 0.0, 0.0]
[[0 0 2]
 [0 1 0]
 [1 0 2]] 1 0
Equally max actions: [0]
S_idx 0 action 0 action_list [0, 1, 2, 3, 4, 5, 6, 7, 8] action_prob [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[[0 0 0]
 [0 0 0]
 [0 0 0]] 2 0
Equally max actions: [0]
S_idx 13950 action 0 action_list [0, 1, 3, 5, 7] action_prob [1.0, 0.0, 0.0, 0.0, 0.0]
[[0 0 2]
 [0 1 0]
 [1 0 2]] 2 0''')
        N_A = 9
        N_Symbols = 3
        my_Q_System = Q_System(N_A, N_Symbols)
        print('-------------------------------------')
        print('[Test]')
        S_l = [[0,0,0, 0,0,0, 0,0,0],
               [0,0,2, 0,1,0, 1,0,2]]
        for P_no in [1,2]:
            for S in S_l:
                S_idx = my_Q_System.calc_S_idx(S)
                my_Q_System.Qsa[P_no-1][S_idx,:] = np.array([1.0,0.0,0, 0,0,0, 0,0,0])
                action, _ = my_Q_System.get_action(P_no, S)
                print(np.reshape(S,(3,3)), P_no, action)


def _main():
    ff = 0.9
    lr = 0.01
    N_episodes = 2
    N_Symbols = 3  # 0=empty, 1=player1, 2=player2
    N_A = 9  # (0,0), (0,1), ..., (2,2)
    my_Q_System = Q_System(N_A, N_Symbols)

    cnt = [0, 0, 0]  # tie, p1, p2
    player = 1
    for episode in range(N_episodes):
        # print('===================================')
        # Can save this data for play 2 as well
        Buff = my_Q_System.play(player)
        my_Q_System.update_Qsa_inplace(Buff, ff=ff, lr=lr)

        win_player = 0 if Buff['r'][-1] == 0 else Buff['P_no'][-1]
        cnt[win_player] += 1

        player = 2 if player == 1 else 1
        if episode % 10 == 0:
            print(episode, cnt)

    print(cnt)


def plot_cnt_trace(cnt_trace):
    N_cnt = len(cnt_trace)
    cnt_d = {'Equal':np.zeros(N_cnt,dtype=int), 'P1':np.zeros(N_cnt,dtype=int), 'P2':np.zeros(N_cnt,dtype=int)}
    for i, cnt in enumerate(cnt_trace):
        cnt_d['Equal'][i] = cnt[0]
        cnt_d['P1'][i] = cnt[1]
        cnt_d['P2'][i] = cnt[2]
    plt.plot(range(N_cnt), cnt_d['Equal'], label='Equal')
    plt.plot(range(N_cnt), cnt_d['P1'], label='Player1 wins')
    plt.plot(range(N_cnt), cnt_d['P2'], label='Player2 wins')
    plt.xlabel('Episode')
    plt.ylabel('Count')
    plt.legend(loc=0)
    plt.title('Learned (P#1) vs. Random (P#2) policies during learning')
    plt.show(True)


def learning_stage(N_episodes=100, save_flag=True, fig_flag=False):
    ff = 0.9
    lr = 0.01
    N_Symbols = 3  # 0=empty, 1=player1, 2=player2
    N_A = 9  # (0,0), (0,1), ..., (2,2)
    print_cnt = N_episodes / 10

    my_Q_System = Q_System(N_A, N_Symbols)
    cnt_trace = my_Q_System.learning(N_episodes=N_episodes, ff=ff, lr=lr, print_cnt=print_cnt)
    print('-------------------')
    cnt = cnt_trace[-1]
    print(N_episodes, cnt)

    if save_flag:
        my_Q_System.save()

    if fig_flag:
        plot_cnt_trace(cnt_trace)

    return my_Q_System


def input_default(str, default_value, dtype=int):
    answer = input(str)
    if answer == '':
        return default_value
    else:
        return dtype(answer)


def main():
    Q1 = input_default('1. Loading a trained agent (0) or Learning a new agent (1)? (default=0) ', 0, int)
    if Q1 == 0:
        print('Loading the trained agent...')
        Q2 = input_default('2. Do you want to start first?(0=yes,1=no,default=0) ', 0, int)
        player_human = Q2 + 1
        if player_human == 1:
            print('You=1, Agent=2')
        else:
            print('Agent=1, You=2')
        trained_Q_System = Q_System(None)
        trained_Q_System.load()
        trained_Q_System.play_with_human(player_human)
        # print(len(trained_Q_System.Qsa))
    else:
        print('Start to learn a new agent...')
        Q2 = input_default('2. How many episodes do you want to learn?(default=10000) ', 10000, int)
        # my_Q_System = learning_stage(N_episodes=Q2, fig_flag=True)
        _ = learning_stage(N_episodes=Q2, fig_flag=True)
        # print(len(my_Q_System.Qsa))


if __name__ == "__main__":
    main()
    # Testing('all')
    pass
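A small, self-contained sketch (not part of the gist) of the base-3 state encoding that calc_S_idx uses to index the 3**9 = 19683 rows of each Q-table; state_index is a hypothetical standalone rewrite of that method:

def state_index(S, n_symbols=3):
    # Board cell i holding symbol s contributes s * n_symbols**i, so every
    # board S = [s0, ..., s8] with si in {0, 1, 2} maps to a unique row index.
    idx, unit = 0, 1
    for s in S:
        idx += s * unit
        unit *= n_symbols
    return idx

# Matches the S_idx value quoted in Testing.test_get_action for this board.
assert state_index([0, 0, 2, 0, 1, 0, 1, 0, 2]) == 13950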