@zh4ngx
Created July 1, 2017 03:20

hill_climb_3.py

import gym
import numpy as np
from gym.wrappers.monitoring import Monitor
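# Hill climbing on CartPole-v0 with a linear policy: a 4-element weight
# vector w maps the observation x to an action via the sign of w . x.
# Each iteration jitters the best weights found so far with annealed
# Gaussian noise and keeps the perturbation only if its Monte Carlo
# return estimate improves. (The script assumes the 2017-era gym API:
# env.reset() returns just the observation and env.step() a 4-tuple.)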

MC_POLICY_EVAL_EP = 10   # episodes per Monte Carlo policy evaluation
BASE_NOISE_FACTOR = 0.5  # initial scale of the Gaussian parameter noise
NUM_POLICY_EVAL = 500    # number of hill-climbing iterations


env = gym.make('CartPole-v0')
env = Monitor(env, 'tmp/cart-pole-hill-climb-3', force=True)
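# The Monitor wrapper above records per-episode statistics (and video,
# where rendering is supported) under the given directory; force=True
# clears any results left over from previous runs.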

    print("Action space: {0}".format(env.action_space))
    print("Observation space: {0}\n\tLow: {1}\n\tHigh: {2}".format(
    env.observation_space,
    env.observation_space.low,
    env.observation_space.high,
    ))


def action_selection(weights, observation):
    # Linear policy: the sign of the dot product w . x selects the action
    # (0 pushes the cart left, 1 pushes it right).
    if np.matmul(weights, observation) < 0:
        return 0
    else:
        return 1


def run_episode(weights):
    # Roll out a single episode (CartPole-v0 caps episodes at 200 steps)
    # and return its total reward.
    observation = env.reset()
    total_reward = 0
    for t in range(200):
        env.render()
        action = action_selection(weights, observation)
        observation, reward, done, info = env.step(action)
        total_reward += reward
        if done:
            print("Episode finished after {0} timesteps with reward {1}".format(
                t + 1,
                total_reward,
            ))
            break

    return total_reward


def evaluate_policy(num_episodes, weights):
    # Monte Carlo estimate of the mean episodic reward for the given weights,
    # maintained as a running (incremental) mean over num_episodes rollouts.
    mean_reward = 0
    for k in range(1, num_episodes + 1):
        reward = run_episode(weights)
        error = reward - mean_reward
        mean_reward += error / k

    print("Mean reward estimated as {0} for past {1} episodes".format(
        mean_reward,
        num_episodes,
    ))
    return mean_reward
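# Note: evaluate_policy uses the incremental-mean update
#     mean_k = mean_{k-1} + (reward_k - mean_{k-1}) / k
# which yields the same value as summing all episode rewards and dividing
# by num_episodes, without storing the individual returns.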


best_reward = -np.inf
best_params = np.random.rand(4) * 2 - 1  # initial weights drawn uniformly from [-1, 1)

print("Running Hill Climb on Cart Pole")
print("Params:\n\tMC Eval Count: {0} trajectories\n\tBase Noise Factor: {1}".format(
    MC_POLICY_EVAL_EP,
    BASE_NOISE_FACTOR,
))
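# Hill-climbing loop: perturb the best weights found so far with Gaussian
# noise whose scale anneals linearly from BASE_NOISE_FACTOR towards zero,
# evaluate the perturbed weights over MC_POLICY_EVAL_EP episodes, and adopt
# them only if the estimated mean reward improves on the best seen so far.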

for i_episode in range(NUM_POLICY_EVAL):
    # Anneal the noise scale linearly towards zero over the run
    # (float() guards against integer division under Python 2)
    annealing_term = 1 - float(i_episode) / NUM_POLICY_EVAL
    noise_scaling = BASE_NOISE_FACTOR * annealing_term
    print("Applying jitter with factor {0} to parameters {1}".format(
        noise_scaling,
        best_params,
    ))

    # Add Gaussian noise to the 1x4 weight vector
    # µ = 0, sigma = noise_scaling
    noise_term = np.random.randn(4) * noise_scaling
    parameters = best_params + noise_term
    episodic_reward = evaluate_policy(MC_POLICY_EVAL_EP, parameters)
    if episodic_reward > best_reward:
        print("Episode {2}: Got new best reward of {0}, better than previous of {1}".format(
            episodic_reward,
            best_reward,
            i_episode,
        ))
        best_reward = episodic_reward
        best_params = parameters

env.close()
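# Optional follow-up, a minimal sketch assuming the definitions above are
# still in scope: best_params can be re-scored over fresh episodes for a
# less noisy estimate of the final policy, e.g.
#
#     env = gym.make('CartPole-v0')
#     final_estimate = evaluate_policy(100, best_params)
#     print("Final mean reward over 100 episodes: {0}".format(final_estimate))
#     env.close()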