A Reinforcement Learning Example
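This notebook works through a small reinforcement learning example: a tabular Q-learning agent that learns to play Tic-Tac-Toe. The cell below defines the game environment, the agent (an epsilon-greedy policy over a Q-table), a training loop that supports either self-play or a random opponent, periodic evaluation, and an interactive game against the trained agent.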
In [ ]:
import numpy as np
import copy
from collections import defaultdict

class TicTacToeEnv:
    def __init__(self):
        self.board = [" "] * 9
        self.current_player = "X"

    def get_state(self):
        return "".join(self.board)

    def get_legal_moves(self):
        return [i for i, mark in enumerate(self.board) if mark == " "]

    def make_move(self, position):
        if self.board[position] != " ":
            return False
        self.board[position] = self.current_player
        self.current_player = "O" if self.current_player == "X" else "X"
        return True

    def check_winner(self):
        # Check rows, columns, and diagonals
        lines = [(0,1,2), (3,4,5), (6,7,8),  # rows
                 (0,3,6), (1,4,7), (2,5,8),  # columns
                 (0,4,8), (2,4,6)]           # diagonals
        for line in lines:
            if (self.board[line[0]] != " " and
                    self.board[line[0]] == self.board[line[1]] == self.board[line[2]]):
                return self.board[line[0]]
        if " " not in self.board:
            return "Draw"
        return None

    def print_board(self):
        for i in range(0, 9, 3):
            print(f"{self.board[i]}|{self.board[i+1]}|{self.board[i+2]}")
            if i < 6:
                print("-+-+-")

class QLearningAgent:
    def __init__(self, epsilon=0.1, learning_rate=0.1, discount=0.95):
        self.q_table = defaultdict(lambda: defaultdict(float))
        self.epsilon = epsilon  # exploration rate
        self.learning_rate = learning_rate
        self.discount = discount

    def choose_move(self, env):
        state = env.get_state()
        legal_moves = env.get_legal_moves()
        # Exploration: random move
        if np.random.random() < self.epsilon:
            return np.random.choice(legal_moves)
        # Exploitation: best known move
        return self._get_best_move(state, legal_moves)

    def _get_best_move(self, state, legal_moves):
        best_value = float('-inf')
        best_moves = []
        for move in legal_moves:
            value = self.q_table[state][move]
            if value > best_value:
                best_value = value
                best_moves = [move]
            elif value == best_value:
                best_moves.append(move)
        return np.random.choice(best_moves)

    def learn(self, state, action, reward, next_state, next_legal_moves):
        # Current Q-value
        current_q = self.q_table[state][action]
        # Best possible future Q-value
        next_q = max([self.q_table[next_state][next_move]
                      for next_move in next_legal_moves]) if next_legal_moves else 0
        # Q-learning update rule
        self.q_table[state][action] = current_q + self.learning_rate * (
            reward + self.discount * next_q - current_q)
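
# The update in QLearningAgent.learn is the standard tabular Q-learning rule:
#     Q(s, a) <- Q(s, a) + learning_rate * (reward + discount * max_a' Q(s', a') - Q(s, a))
# where the max is taken over the legal moves in the next state (and is 0 at terminal states).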

def train_agent(episodes=1000, eval_frequency=100, training_mode="self_play"):
    """
    Train a Q-learning agent for Tic-Tac-Toe

    Args:
        episodes: Number of games to play during training
        eval_frequency: How often to evaluate the agent
        training_mode: "self_play" or "random_opponent"

    Returns:
        best_agent: The best performing agent during training
        training_metrics: List of evaluation metrics throughout training
    """
    # Create agent(s)
    agent_x = QLearningAgent()
    agent_o = QLearningAgent() if training_mode == "self_play" else None
    training_metrics = []
    # For tracking agent progress
    best_win_rate = 0.0
    best_agent = None
    for episode in range(episodes):
        env = TicTacToeEnv()
        if training_mode == "self_play":
            # Self-play training mode
            game_history_x = []  # [(state, action, next_state), ...]
            game_history_o = []
            while True:
                # X's turn
                state = env.get_state()
                action = agent_x.choose_move(env)
                env.make_move(action)
                next_state = env.get_state()
                game_history_x.append((state, action, next_state))
                # Check if game ended after X's move
                winner = env.check_winner()
                if winner:
                    break
                # O's turn
                state = env.get_state()
                action = agent_o.choose_move(env)
                env.make_move(action)
                next_state = env.get_state()
                game_history_o.append((state, action, next_state))
                # Check if game ended after O's move
                winner = env.check_winner()
                if winner:
                    break
            # Assign rewards and learn from the complete game
            reward_x = 1.0 if winner == "X" else -1.0 if winner == "O" else 0.0
            reward_o = -reward_x  # Zero-sum game
            # X learns from its moves (the terminal reward is credited to every
            # move in the game; no bootstrapping from next states is used here)
            for state, action, next_state in game_history_x:
                agent_x.learn(state, action, reward_x, next_state, [])
            # O learns from its moves
            for state, action, next_state in game_history_o:
                agent_o.learn(state, action, reward_o, next_state, [])
        else:
            # Random opponent mode
            state = env.get_state()
            while True:
                # Agent's turn (X)
                action = agent_x.choose_move(env)
                env.make_move(action)
                next_state = env.get_state()
                # Check if game ended after agent's move
                winner = env.check_winner()
                if winner:
                    reward = 1.0 if winner == "X" else -1.0 if winner == "O" else 0.0
                    agent_x.learn(state, action, reward, next_state, [])
                    break
                # Random opponent's turn (O)
                o_move = np.random.choice(env.get_legal_moves())
                env.make_move(o_move)
                after_o_state = env.get_state()
                # Check if game ended after opponent's move
                winner = env.check_winner()
                if winner:
                    reward = 1.0 if winner == "X" else -1.0 if winner == "O" else 0.0
                    agent_x.learn(state, action, reward, after_o_state, [])
                    break
                # If game continues, learn from this state-action pair
                agent_x.learn(state, action, 0, after_o_state, env.get_legal_moves())
                state = after_o_state
        # Evaluate the agent periodically
        if episode % eval_frequency == 0:
            wins, losses, draws = evaluate_agent(agent_x, n_games=100)
            training_metrics.append({
                'episode': episode,
                'wins': wins,
                'losses': losses,
                'draws': draws
            })
            print(f"Episode {episode}: Wins: {wins}%, Losses: {losses}%, Draws: {draws}%")
            # Keep a snapshot of the best performing agent so far
            # (a copy, so further training does not overwrite it)
            if wins >= best_win_rate:
                best_win_rate = wins
                best_agent = copy.deepcopy(agent_x)
    return best_agent, training_metrics

def evaluate_agent(agent, n_games=100):
    """Evaluate agent performance without exploration"""
    original_epsilon = agent.epsilon
    agent.epsilon = 0  # Turn off exploration
    wins = losses = draws = 0
    for _ in range(n_games):
        env = TicTacToeEnv()
        while True:
            # Agent's turn (X)
            action = agent.choose_move(env)
            env.make_move(action)
            winner = env.check_winner()
            if winner:
                if winner == "X": wins += 1
                elif winner == "O": losses += 1
                else: draws += 1
                break
            # Random opponent's turn (O)
            o_moves = env.get_legal_moves()
            if o_moves:
                o_move = np.random.choice(o_moves)
                env.make_move(o_move)
                winner = env.check_winner()
                if winner:
                    if winner == "X": wins += 1
                    elif winner == "O": losses += 1
                    else: draws += 1
                    break
            else:
                # No legal moves for opponent, must be a draw
                draws += 1
                break
    agent.epsilon = original_epsilon  # Restore exploration
    return (wins / n_games) * 100, (losses / n_games) * 100, (draws / n_games) * 100

def play_game(agent, human_player="O"):
    env = TicTacToeEnv()
    print("Game starts! Positions are numbered 0-8, left to right, top to bottom")
    print("0|1|2\n-+-+-\n3|4|5\n-+-+-\n6|7|8")
    print("\nCurrent board:")
    env.print_board()
    while True:
        # Agent's turn if it's X's turn and human is O (or vice versa)
        if env.current_player != human_player:
            move = agent.choose_move(env)
            print(f"\nAgent plays position {move}")
        else:
            while True:
                try:
                    move = int(input(f"\nYour turn ({human_player}). Enter position (0-8): "))
                    if move in env.get_legal_moves():
                        break
                    print("Invalid move, try again")
                except ValueError:
                    print("Please enter a number between 0 and 8")
        env.make_move(move)
        print("\nCurrent board:")
        env.print_board()
        winner = env.check_winner()
        if winner:
            if winner == "Draw":
                print("\nGame Over - Draw!")
            else:
                print(f"\nGame Over - {winner} wins!")
            break

# Example usage:
if __name__ == "__main__":
    print("Which training mode would you like to use?")
    print("1. Self-play (agent learns by playing against itself)")
    print("2. Random opponent (agent learns by playing against random moves)")
    choice = input("Enter 1 or 2: ")
    training_mode = "self_play" if choice == "1" else "random_opponent"
    print(f"\nTraining agent using {training_mode} mode...")
    best_agent, training_metrics = train_agent(
        episodes=10000,
        eval_frequency=500,
        training_mode=training_mode
    )
    print("\nTraining complete! Let's play against the best agent!")
    best_agent.epsilon = 0  # Play greedily against the human, with no exploration
    play_game(best_agent)
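Running the cell above prompts for a training mode and then for your moves against the trained agent. For a quick, non-interactive check of the training loop, the sketch below calls train_agent and evaluate_agent directly; the episode and game counts are illustrative, not tuned.
In [ ]:
# Non-interactive sketch: train against a random opponent and report results
# (episode and game counts are illustrative, not tuned)
agent, metrics = train_agent(episodes=2000, eval_frequency=500,
                             training_mode="random_opponent")
print("Last evaluation snapshot:", metrics[-1])
print("Greedy win/loss/draw percentages:", evaluate_agent(agent, n_games=200))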