import random

student_name = "Type your full name here."


# 1. Q-Learning
class QLearningAgent:
    """Implement Q Reinforcement Learning Agent using Q-table."""

    def __init__(self, game, discount, learning_rate, explore_prob):
        """Store any needed parameters into the agent object.
        Initialize Q-table.
        """
        self.game = game
        self.discount = discount  # γ - how much we value future rewards
        self.learning_rate = learning_rate  # α - how fast we learn
        self.explore_prob = explore_prob  # ε - probability of exploration

        # Q-table: dictionary mapping (state, action) → Q-value
        self.q_table = {}

    def get_q_value(self, state, action):
        """Retrieve Q-value from Q-table.
        For an never seen (s,a) pair, the Q-value is by default 0.
        """
        # Return stored Q-value, or 0 if never seen before
        return self.q_table.get((state, action), 0.0)

    def get_value(self, state):
        """Compute state value from Q-values using Bellman Equation.
        V(s) = max_a Q(s,a)
        """
        actions = self.game.get_actions(state)

        # If no actions (terminal state), value is 0
        if not actions:
            return 0.0

        # V(s) = max over all actions of Q(s,a)
        return max(self.get_q_value(state, action) for action in actions)

    def get_best_policy(self, state):
        """Compute the best action to take in the state using Policy Extraction.
        π(s) = argmax_a Q(s,a)

        If there are ties, break them randomly for better performance.
        Hint: use random.choice().
        """
        actions = self.game.get_actions(state)

        # If no actions available, return None
        if not actions:
            return None

        # Find the maximum Q-value
        max_q = max(self.get_q_value(state, action) for action in actions)

        # Find all actions that have this maximum Q-value (handle ties)
        best_actions = [action for action in actions
                        if self.get_q_value(state, action) == max_q]

        # Return random choice among best actions
        return random.choice(best_actions)

    def update(self, state, action, next_state, reward):
        """Update Q-values using running average.
        Q(s,a) = (1 - α) Q(s,a) + α (R + γ V(s'))
        Where α is the learning rate, and γ is the discount.

        Note: You should not call this function in your code.
        """
        # Get current Q-value (use get_q_value for abstraction!)
        current_q = self.get_q_value(state, action)

        # Get value of next state: V(s') = max_a Q(s',a)
        next_value = self.get_value(next_state)

        # Q-learning update formula
        # Q(s,a) ← (1-α)Q(s,a) + α[R + γV(s')]
        new_q = (1 - self.learning_rate) * current_q + \
                self.learning_rate * (reward + self.discount * next_value)

        # Store updated Q-value
        self.q_table[(state, action)] = new_q

    # 2. Epsilon Greedy
    def get_action(self, state):
        """Compute the action to take for the agent, incorporating exploration.
        That is, with probability ε, act randomly.
        Otherwise, act according to the best policy.

        Hint: use random.random() < ε to check if exploration is needed.
        """
        actions = self.game.get_actions(state)

        # If no actions available, return None
        if not actions:
            return None

        # Epsilon-greedy strategy
        if random.random() < self.explore_prob:
            # Explore: choose random action
            return random.choice(list(actions))
        else:
            # Exploit: choose best action
            return self.get_best_policy(state)
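

# Illustrative usage sketch (not part of the assignment): a hypothetical
# two-state game stub exposing only get_actions(state), used to check one
# tabular update by hand. With α = 0.5, γ = 0.9, Q(s,a) = 0, R = 1 and
# V(s') = 0 (terminal next state), the rule gives
# 0.5 * 0 + 0.5 * (1 + 0.9 * 0) = 0.5.
def _tabular_update_sketch():
    class _ChainGame:
        """Hypothetical stub: state 'A' can 'move'; state 'B' is terminal."""

        def get_actions(self, state):
            return {"move"} if state == "A" else set()

    # update() is normally invoked by the training loop; it is called directly
    # here only to illustrate the update formula.
    agent = QLearningAgent(_ChainGame(), discount=0.9,
                           learning_rate=0.5, explore_prob=0.1)
    agent.update("A", "move", "B", reward=1)
    return agent.get_q_value("A", "move")  # expected 0.5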


# 3. Bridge Crossing Revisited
def question3():
    """
    For Q-Learning to learn to cross the bridge with noise=0:

    The agent starts out acting randomly. With epsilon=0 (no exploration), if it
    happens to go LEFT first, it gets the +1 reward and learns that LEFT is good.
    It will NEVER explore going RIGHT to discover the +10 reward.

    With epsilon > 0, there is exploration. Even after it learns that LEFT is
    good, it will occasionally explore RIGHT and discover that +10 is better.

    The learning rate should be high enough to learn quickly within 50 episodes.
    """
    epsilon = 0.5  # Need exploration to discover both paths
    learning_rate = 0.5  # Moderate learning rate to learn from experience
    # If not possible, return 'NOT POSSIBLE'
    return epsilon, learning_rate


# 5. Approximate Q-Learning
class ApproximateQAgent(QLearningAgent):
    """Implement Approximate Q Learning Agent using weights."""

    def __init__(self, *args, extractor):
        """Initialize parameters and store the feature extractor.
        Initialize weights table."""
        super().__init__(*args)

        # Feature extractor function: takes (state, action) → features dict
        self.extractor = extractor

        # Weights table: dictionary mapping feature_name → weight
        self.weights = {}

    def get_weight(self, feature):
        """Get weight of a feature.
        Never seen feature should have a weight of 0.
        """
        return self.weights.get(feature, 0.0)

    def get_q_value(self, state, action):
        """Compute Q value based on the dot product of feature components and weights.
        Q(s,a) = w_1 * f_1(s,a) + w_2 * f_2(s,a) + ... + w_n * f_n(s,a)
        """
        # Get features for this (state, action) pair
        features = self.extractor(state, action)

        # Compute dot product: sum of (weight * feature_value)
        q_value = sum(self.get_weight(feature) * value
                      for feature, value in features.items())

        return q_value

    def update(self, state, action, next_state, reward):
        """Update weights using least-squares approximation.
        Δ = R + γ V(s') - Q(s,a)
        Then update weights: w_i = w_i + α * Δ * f_i(s, a)
        """
        # Calculate TD error (difference between target and prediction)
        # Target: R + γ V(s')
        # Prediction: Q(s,a)
        next_value = self.get_value(next_state)
        current_q = self.get_q_value(state, action)

        td_error = reward + self.discount * next_value - current_q

        # Get features for this (state, action) pair
        features = self.extractor(state, action)

        # Update each weight: w_i ← w_i + α * Δ * f_i(s,a)
        for feature, feature_value in features.items():
            current_weight = self.get_weight(feature)
            new_weight = current_weight + self.learning_rate * td_error * feature_value
            self.weights[feature] = new_weight
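

# Illustrative usage sketch (not part of the assignment): a hypothetical
# constant "bias" feature extractor, used to check one weight update by hand.
# All weights start at 0, so Q(s,a) = 0 and the TD error is
# Δ = R + γ V(s') - Q(s,a) = 1 + 0.9 * 0 - 0 = 1, giving
# w_bias = 0 + α * Δ * f_bias = 0 + 0.5 * 1 * 1.0 = 0.5.
def _approximate_update_sketch():
    class _ChainGame:
        """Hypothetical stub: state 'A' can 'move'; state 'B' is terminal."""

        def get_actions(self, state):
            return {"move"} if state == "A" else set()

    def _bias_extractor(state, action):
        # Single constant feature; real extractors would depend on (state, action).
        return {"bias": 1.0}

    # update() is normally invoked by the training loop; it is called directly
    # here only to illustrate the weight update.
    agent = ApproximateQAgent(_ChainGame(), 0.9, 0.5, 0.1,
                              extractor=_bias_extractor)
    agent.update("A", "move", "B", reward=1)
    return agent.get_weight("bias")  # expected 0.5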


# 6. Feedback
# Just an approximation is fine.
feedback_question_1 = 5

feedback_question_2 = """
The most challenging aspect was understanding the difference between model-based
(Value Iteration) and model-free (Q-Learning) approaches. The epsilon-greedy
exploration strategy and how to balance exploration vs. exploitation were tricky.
"""

feedback_question_3 = """
I enjoyed seeing how Q-Learning learns through trial and error without needing
to know the full model. The approximate Q-learning with feature extraction was
particularly interesting as it shows how to scale to large state spaces.
"""