A Reinforcement Learning Example

In [ ]:
import copy
import numpy as np
from collections import defaultdict

class TicTacToeEnv:
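    """A minimal Tic-Tac-Toe environment: the board is a flat list of nine "X", "O", or " " cells."""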
    def __init__(self):
        self.board = [" "] * 9
        self.current_player = "X"
    
    def get_state(self):
        return "".join(self.board)
    
    def get_legal_moves(self):
        return [i for i, mark in enumerate(self.board) if mark == " "]
    
    def make_move(self, position):
        if self.board[position] != " ":
            return False
        self.board[position] = self.current_player
        self.current_player = "O" if self.current_player == "X" else "X"
        return True
    
    def check_winner(self):
        # Check rows, columns, and diagonals
        lines = [(0,1,2), (3,4,5), (6,7,8),  # rows
                (0,3,6), (1,4,7), (2,5,8),  # columns
                (0,4,8), (2,4,6)]           # diagonals
        
        for line in lines:
            if (self.board[line[0]] != " " and
                self.board[line[0]] == self.board[line[1]] == self.board[line[2]]):
                return self.board[line[0]]
        
        if " " not in self.board:
            return "Draw"
        return None
    
    def print_board(self):
        for i in range(0, 9, 3):
            print(f"{self.board[i]}|{self.board[i+1]}|{self.board[i+2]}")
            if i < 6:
                print("-+-+-")

class QLearningAgent:
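    """Tabular Q-learning agent with epsilon-greedy move selection."""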
    def __init__(self, epsilon=0.1, learning_rate=0.1, discount=0.95):
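        # q_table[state][action] -> estimated value of playing `action` in `state`;
        # unseen state-action pairs default to 0.0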
        self.q_table = defaultdict(lambda: defaultdict(float))
        self.epsilon = epsilon  # exploration rate
        self.learning_rate = learning_rate
        self.discount = discount
    
    def choose_move(self, env):
        state = env.get_state()
        legal_moves = env.get_legal_moves()
        
        # Exploration: random move
        if np.random.random() < self.epsilon:
            return np.random.choice(legal_moves)
        
        # Exploitation: best known move
        return self._get_best_move(state, legal_moves)
    
    def _get_best_move(self, state, legal_moves):
        best_value = float('-inf')
        best_moves = []
        
        for move in legal_moves:
            value = self.q_table[state][move]
            if value > best_value:
                best_value = value
                best_moves = [move]
            elif value == best_value:
                best_moves.append(move)
        
        return np.random.choice(best_moves)
    
    def learn(self, state, action, reward, next_state, next_legal_moves):
        # Current Q-value
        current_q = self.q_table[state][action]
        
        # Best possible future Q-value
        next_q = max([self.q_table[next_state][next_move] 
                     for next_move in next_legal_moves]) if next_legal_moves else 0
        
        # Q-learning update rule
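        # Q(s, a) <- Q(s, a) + learning_rate * (reward + discount * max_a' Q(s', a') - Q(s, a))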
        self.q_table[state][action] = current_q + self.learning_rate * (
            reward + self.discount * next_q - current_q)

def train_agent(episodes=1000, eval_frequency=100, training_mode="self_play"):
    """
    Train a Q-learning agent for Tic-Tac-Toe
    
    Args:
        episodes: Number of games to play during training
        eval_frequency: How often to evaluate the agent
        training_mode: "self_play" or "random_opponent"
    
    Returns:
        best_agent: The best performing agent during training
        training_metrics: List of evaluation metrics throughout training
    """
    # Create agent(s)
    agent_x = QLearningAgent()
    agent_o = QLearningAgent() if training_mode == "self_play" else None
    training_metrics = []
    
    # For tracking agent progress
    best_win_rate = 0.0
    best_agent = None
    
    for episode in range(episodes):
        env = TicTacToeEnv()
        
        if training_mode == "self_play":
            # Self-play training mode
            game_history_x = []  # [(state, action, next_state), ...]
            game_history_o = []
            
            while True:
                # X's turn
                state = env.get_state()
                action = agent_x.choose_move(env)
                env.make_move(action)
                next_state = env.get_state()
                game_history_x.append((state, action, next_state))
                
                # Check if game ended after X's move
                winner = env.check_winner()
                if winner:
                    break
                
                # O's turn
                state = env.get_state()
                action = agent_o.choose_move(env)
                env.make_move(action)
                next_state = env.get_state()
                game_history_o.append((state, action, next_state))
                
                # Check if game ended after O's move
                winner = env.check_winner()
                if winner:
                    break
            
            # Assign rewards and learn from the complete game
            reward_x = 1.0 if winner == "X" else -1.0 if winner == "O" else 0.0
            reward_o = -reward_x  # Zero-sum game
            
            # X learns from its moves. The game is over at this point, so each
            # stored transition is updated directly toward the final outcome
            # (no next legal moves, i.e. no bootstrapping from future states).
            for state, action, next_state in game_history_x:
                agent_x.learn(state, action, reward_x, next_state, [])
            
            # O learns from its moves in the same way
            for state, action, next_state in game_history_o:
                agent_o.learn(state, action, reward_o, next_state, [])
        
        else:
            # Random opponent mode
            state = env.get_state()
            
            while True:
                # Agent's turn (X)
                action = agent_x.choose_move(env)
                env.make_move(action)
                next_state = env.get_state()
                
                # Check if game ended after agent's move
                winner = env.check_winner()
                if winner:
                    reward = 1.0 if winner == "X" else -1.0 if winner == "O" else 0.0
                    agent_x.learn(state, action, reward, next_state, [])
                    break
                
                # Random opponent's turn (O)
                o_move = np.random.choice(env.get_legal_moves())
                env.make_move(o_move)
                after_o_state = env.get_state()
                
                # Check if game ended after opponent's move
                winner = env.check_winner()
                if winner:
                    reward = 1.0 if winner == "X" else -1.0 if winner == "O" else 0.0
                    agent_x.learn(state, action, reward, after_o_state, [])
                    break
                
                # If game continues, learn from this state-action pair
                agent_x.learn(state, action, 0, after_o_state, env.get_legal_moves())
                state = after_o_state
    
        # Evaluate agents periodically
        if episode % eval_frequency == 0:
            wins, losses, draws = evaluate_agent(agent_x, n_games=100)
            training_metrics.append({
                'episode': episode,
                'wins': wins,
                'losses': losses,
                'draws': draws
            })
            print(f"Episode {episode}: Wins: {wins}%, Losses: {losses}%, Draws: {draws}%")
            
            # Keep a frozen copy of the best-performing agent so far
            # (a plain reference would keep changing as agent_x continues training)
            if wins > best_win_rate:
                best_win_rate = wins
                best_agent = copy.deepcopy(agent_x)
    
    return best_agent, training_metrics

def evaluate_agent(agent, n_games=100):
    """Evaluate agent performance without exploration"""
    original_epsilon = agent.epsilon
    agent.epsilon = 0  # Turn off exploration
    
    wins = losses = draws = 0
    
    for _ in range(n_games):
        env = TicTacToeEnv()
        while True:
            # Agent's turn (X)
            action = agent.choose_move(env)
            env.make_move(action)
            
            winner = env.check_winner()
            if winner:
                if winner == "X": wins += 1
                elif winner == "O": losses += 1
                else: draws += 1
                break
            
            # Random opponent's turn (O). check_winner() already reported a
            # draw on a full board, so at least one legal move remains here.
            o_move = np.random.choice(env.get_legal_moves())
            env.make_move(o_move)
            
            winner = env.check_winner()
            if winner:
                if winner == "X": wins += 1
                elif winner == "O": losses += 1
                else: draws += 1
                break
    
    agent.epsilon = original_epsilon  # Restore exploration
    return (wins/n_games)*100, (losses/n_games)*100, (draws/n_games)*100

def play_game(agent, human_player="O"):
    """Play one interactive game between the trained agent and a human."""
    env = TicTacToeEnv()
    
    # Play greedily against the human: temporarily disable exploration
    original_epsilon = agent.epsilon
    agent.epsilon = 0
    
    print("Game starts! Positions are numbered 0-8, left to right, top to bottom")
    print("0|1|2\n-+-+-\n3|4|5\n-+-+-\n6|7|8")
    print("\nCurrent board:")
    env.print_board()
    
    while True:
        # Agent moves whenever it is not the human's turn
        if env.current_player != human_player:
            move = agent.choose_move(env)
            print(f"\nAgent plays position {move}")
        else:
            while True:
                try:
                    move = int(input(f"\nYour turn ({human_player}). Enter position (0-8): "))
                    if move in env.get_legal_moves():
                        break
                    print("Invalid move, try again")
                except ValueError:
                    print("Please enter a number between 0 and 8")
        
        env.make_move(move)
        print("\nCurrent board:")
        env.print_board()
        
        winner = env.check_winner()
        if winner:
            if winner == "Draw":
                print("\nGame Over - Draw!")
            else:
                print(f"\nGame Over - {winner} wins!")
            break
    
    agent.epsilon = original_epsilon  # Restore exploration

# Example usage:
if __name__ == "__main__":
    print("Which training mode would you like to use?")
    print("1. Self-play (agent learns by playing against itself)")
    print("2. Random opponent (agent learns by playing against random moves)")
    choice = input("Enter 1 or 2: ")
    
    training_mode = "self_play" if choice == "1" else "random_opponent"
    print(f"\nTraining agent using {training_mode} mode...")
    
    best_agent, training_metrics = train_agent(
        episodes=10000, 
        eval_frequency=500,
        training_mode=training_mode
    )
    
    print("\nTraining complete! Let's play against the best agent!")
    play_game(best_agent)
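
The optional cell below is a quick scripted sanity check of the trained agent: it reuses evaluate_agent from above to play against a random opponent with exploration turned off and prints the win/loss/draw percentages. The choice of 500 games is arbitrary.

In [ ]:
# Scripted evaluation of the trained agent (no human input needed).
# evaluate_agent() temporarily sets epsilon to 0, so the agent plays greedily.
wins, losses, draws = evaluate_agent(best_agent, n_games=500)
print(f"vs. random opponent: {wins:.1f}% wins, {losses:.1f}% losses, {draws:.1f}% draws")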