from infrastructure.utils.logger import Logger
import infrastructure.utils.torch_utils as tu

import gymnasium as gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import random

"""
The Policy/Trainer interface remains the same as in the first assignment:
"""

class Policy:
    def __init__(self, *args, **kwargs):
        raise NotImplementedError()

    # Should sample an action from the policy in the given state
    def play(self, state: int, *args, **kwargs) -> int:
        raise NotImplementedError()

    # Should return the predicted Q-values for the given state
    def raw(self, state: int, *args, **kwargs) -> torch.Tensor:
        raise NotImplementedError()


class Trainer:
    def __init__(self, env, *args, **kwargs):
        self.env = env

    # `gamma` is the discount factor
    # `steps` is the total number of calls to env.step()
    def train(self, gamma: float, steps: int, *args, **kwargs) -> Policy:
        raise NotImplementedError()


class ReplayBuffer:
    """
    Example implementation of a simple replay buffer.
    You will need to modify this, especially to support n-step updates.

    Important: You are free to modify this class or provide your own.
    It is not part of the required interface.
    """

    def __init__(self, capacity):
        self.idx = 0
        self.capacity = capacity

        # s, a, r, s', done
        self.transitions = []

    def insert(self, transition):
        """
        Insert a transition into the replay buffer.
        `transition` is a tuple (s, a, r, s', done)
        """
        if len(self.transitions) < self.capacity:
            self.transitions.append(transition)
        else:
            self.transitions[self.idx] = transition
        self.idx = (self.idx + 1) % self.capacity

    def sample(self, batch_size):
        """
        Return a batch of `batch_size` transitions.
        """
        if batch_size > len(self.transitions):
            raise RuntimeError("Not enough transitions in replay buffer.")

        batch = random.sample(self.transitions, batch_size)
        return batch

    def __len__(self):
        return len(self.transitions)
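
# --- Illustrative sketch (not part of the required interface) ---
# The buffer above stores single-step transitions. One possible way to support
# n-step returns is to keep a short window (e.g. a deque) of the most recent
# transitions and, once it holds n of them, collapse it into a single n-step
# transition (s_t, a_t, sum_k gamma^k * r_{t+k}, s_{t+n}, done) before
# inserting it into the replay buffer. The helper below is only a sketch of
# that idea; the name `make_n_step_transition` and the exact call site are
# assumptions, not a prescribed API.

def make_n_step_transition(window, gamma):
    """
    Collapse a list of consecutive single-step transitions (s, a, r, s', done)
    into one n-step transition. If the episode ends inside the window, the
    discounted reward sum is truncated at that point.
    """
    s0, a0, _, _, _ = window[0]
    g, discount = 0.0, 1.0
    next_state, done = window[-1][3], window[-1][4]
    for (_, _, r, s_next, d) in window:
        g += discount * r
        discount *= gamma
        if d:
            next_state, done = s_next, True
            break
    return (s0, a0, g, next_state, done)

# Typical use during interaction (sketch only; flushing the remaining shorter
# windows at the end of an episode is left out):
#     window = deque(maxlen=n_steps)
#     window.append((s, a, r, s_next, done))
#     if len(window) == n_steps:
#         buffer.insert(make_n_step_transition(list(window), gamma))
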
"""
The goal in the second assignment is to implement your own DQN agent,
along with some additional features. The mandatory ones include:

    1) Target network for bootstrapping
    2) Double DQN
    3) N-step returns for calculating the target
    4) Scheduling of the epsilon parameter over time

DISCLAIMER: All the provided code is just a template that can help you get
started and is not mandatory to use. You only need to stick to the interface
and the method signatures of the constructor and `train` for DQNTrainer.

Some of the extensions above can be implemented in multiple ways - like
exponential averaging vs hard updates for the target net. You can choose
either, or even experiment with both.
"""

class DQNNet(nn.Module):
    def __init__(self, input_size, output_size, hidden_size=86):
        super(DQNNet, self).__init__()

        # Dummy layer to prevent errors
        self.dummy_layer = nn.Linear(1, 1)

        # TODO: Implement the network architecture - see torch.nn layers.

    def forward(self, x):
        # TODO: implement the forward pass, see torch.nn.functional
        # for common activation functions.

        # Dummy return value to prevent errors
        return torch.zeros(2)

    @torch.no_grad()
    def play(self, obs, eps=0.0):
        qvals = self(obs)

        if np.random.rand() <= eps:
            return np.random.choice(len(qvals))

        # You can also randomly break ties here.
        x = torch.argmax(qvals)

        # Cast from tensor to int so gym does not complain
        return int(x)


class DQNPolicy(Policy):
    def __init__(self, net: DQNNet):
        self.net = net

    def play(self, state):
        return self.net.play(state)

    def raw(self, state: int) -> torch.Tensor:
        return self.net(state)
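
# --- Illustrative sketch (not part of the required interface) ---
# The three modes differ only in how the bootstrap value of the next state is
# chosen:
#
#   "DQN"        - max_a Q(s', a) from the online net
#   "DQN+target" - max_a Q_target(s', a) from the target net
#   "DoubleDQN"  - a* = argmax_a Q(s', a) chosen by the online net,
#                  evaluated as Q_target(s', a*)
#
# A minimal sketch, assuming `net` and `target_net` are DQNNet instances and
# `next_states` is a batched tensor; the function name and arguments are
# assumptions for illustration only.

@torch.no_grad()
def bootstrap_values_sketch(net, target_net, next_states, mode):
    if mode == "DQN":
        return net(next_states).max(dim=1, keepdim=True).values
    if mode == "DQN+target":
        return target_net(next_states).max(dim=1, keepdim=True).values
    # DoubleDQN: select with the online net, evaluate with the target net
    best_actions = net(next_states).argmax(dim=1, keepdim=True)
    return target_net(next_states).gather(1, best_actions)

# The (n-step) target would then be roughly
#     r^(n) + gamma**n_steps * (1 - done) * bootstrap_values_sketch(...)
# where r^(n) is the discounted reward sum stored in the n-step transition.
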
class DQNTrainer(Trainer):
    DQN = "DQN"
    DQN_TARGET = "DQN+target"
    DOUBLE_DQN = "DoubleDQN"

    def __init__(
        self,
        env,
        state_dim,
        num_actions,
        # TODO: Find good hyperparameters working for all three environments
        # and set them as default values. During the grading, we will test
        # your implementation on your own default hyperparameters.
        lr=0.01,
        mini_batch=64,
        max_buffer_size=10000,
        n_steps=1,
        initial_eps=1.0,
        final_eps=0.1,
        mode=DQN,
        **kwargs
    ) -> None:
        super(DQNTrainer, self).__init__(env)
        """
        Initialize the DQNTrainer

        Args:
            env: The environment to train on
            state_dim: The dimension of the state space
            num_actions: The number of actions in the action space
            lr: The learning rate
            mini_batch: The mini batch size
            max_buffer_size: The maximum replay buffer size
            n_steps: The number of steps to look ahead when calculating targets
            initial_eps: The initial epsilon value for epsilon-greedy exploration
            final_eps: The final epsilon value for epsilon-greedy exploration
            mode: The mode of operation. Can be "DQN", "DQN+target", "DoubleDQN"
        """

        # Initialize the trainable net
        self.net = DQNNet(state_dim, num_actions)

        # Initialize the target net as a copy of the main net
        self.target_net = DQNNet(state_dim, num_actions)
        self.target_net.load_state_dict(self.net.state_dict())

        # Initialize the optimizer
        self.optimizer = optim.Adam(self.net.parameters(), lr=lr)

        # Initialize the buffer
        self.buffer = ReplayBuffer(max_buffer_size)

        # TODO: Initialize other necessary variables

    """
    You can modify or even remove the methods `loss_fn`, `calculate_targets`
    and `update_net`. They serve mostly as an example of how learning works
    in pytorch.
    """

    def loss_fn(self, qvals, target_qvals):
        """
        Calculate loss on a batch of Q-values Q(s,a) and a batch of targets.
        You can use an appropriate torch.nn loss.
        """
        pass

    def calculate_targets(self, transition_batch):
        """
        Recall the constructor arguments `mode` and `n_steps` and how they
        influence the target calculation.
        """

        # Here are some tensor operations which might be useful:
        states = [torch.tensor([1.0]), torch.tensor([1.2])]
        actions = torch.tensor([0, 1])

        print("Concat tensors along new dimension:")
        state_batch = torch.stack(states)
        print(states)
        print(state_batch)

        print("Insert a new dimension:")
        action_batch = actions.unsqueeze(1)
        print(actions)
        print(action_batch, end="\n\n")

        # Once you implement the neural net, you can pass a batch of inputs to
        # the model like so:
        q_values = self.net(state_batch)

        print("Selecting elements from:")
        some_data = torch.tensor([[1, 2], [4, 3]])
        print(some_data)

        print("Select indices 0 & 1 along dimension 1")
        selected_elems = some_data.gather(1, action_batch)
        print(selected_elems, "\n")

        print("Indices of maximal elements in each row")
        selected_elems = some_data.argmax(dim=1, keepdim=True)
        print(selected_elems)

        print("And their values:")
        print(some_data.gather(1, selected_elems))

        return torch.tensor(42)

    def update_net(self, *args):
        """
        Update of the main net parameters:
            1) Calculate gradient estimate from the batch
            2) Do a single step of gradient descent using this estimate
        """

        # TODO: calculate these values
        qvals = ...
        target_qvals = self.calculate_targets([])

        # Define the loss function
        loss = self.loss_fn(qvals, target_qvals)

        """
        ALWAYS call the following three methods in this order:
            1) Zero saved gradients in optimizer
            2) Calculate gradient of the loss
            3) Perform an optimization step
        """
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def train(self, gamma, train_time_steps) -> DQNPolicy:
        """
        TODO: Interact with the environment through the methods
        `env.reset()` and `env.step(action)`
        """
        state, _ = self.env.reset()

        # You need to cast states from numpy arrays to torch tensors if you
        # want to pass them to your neural net. You can use the provided
        # utilities for this.
        state = tu.to_torch(state)

        eps = 1.0
        step = 0

        while step < train_time_steps:
            done = False
            while not done and step < train_time_steps:
                action = self.net.play(state, eps)
                succ, rew, terminated, truncated, _ = self.env.step(action)

                """
                TODO:
                    1) Save the transition into the replay buffer.
                    2) Sample a minibatch from the buffer
                    3) Update the main network
                    4) (Possibly) update the target network as well.
                """
                transition = ...
                self.buffer.insert(transition)

                # NOTE: 42 is just a placeholder; use your `mini_batch`
                # hyperparameter here.
                if len(self.buffer) >= 42:
                    batch = self.buffer.sample(batch_size=42)

                step += 1

                if terminated or truncated:
                    done = True

        return DQNPolicy(self.net)


"""
Helper function to get dimensions of state/action spaces of gym environments.
"""
def get_env_dimensions(env):
    def get_space_dimensions(space):
        if isinstance(space, gym.spaces.Discrete):
            return space.n
        elif isinstance(space, gym.spaces.Box):
            return space.shape[0]
        else:
            raise TypeError(
                f"Space type {type(space)} in get_env_dimensions not recognized, "
                "not an instance of Discrete/Box"
            )

    state_dim = get_space_dimensions(env.observation_space)
    num_actions = get_space_dimensions(env.action_space)
    return state_dim, num_actions


"""
Demonstration code - get states/actions, play randomly
"""
def example_human_eval(env_name):
    env = gym.make(env_name)
    state_dim, num_actions = get_env_dimensions(env)

    trainer = DQNTrainer(env, state_dim, num_actions)

    # Tensor operations example
    trainer.calculate_targets([])

    # Train the agent on 1000 steps.
    pol = trainer.train(0.99, 1000)

    # Visualize the policy for 10 episodes
    human_env = gym.make(env_name, render_mode="human")
    for _ in range(10):
        state = human_env.reset()[0]

        done = False
        while not done:
            action = pol.play(tu.to_torch(state))
            state, _, terminated, truncated, _ = human_env.step(action)
            done = terminated or truncated


if __name__ == "__main__":
    # Evaluate your algorithm on the following three environments
    env_names = ["CartPole-v1", "Acrobot-v1", "LunarLander-v2"]
    example_human_eval(env_names[0])
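

# --- Appendix: illustrative sketches (not part of the required interface) ---
# Two of the mandatory features are mostly bookkeeping. The helpers below are
# only sketches under assumed names and defaults (`decay_fraction`, `tau` are
# assumptions); where and how they are called, e.g. inside `train`, is up to
# your implementation. They are defined after the __main__ block and are never
# executed by the demonstration code above.

def linear_epsilon(step, total_steps, initial_eps=1.0, final_eps=0.1, decay_fraction=0.5):
    """
    Linearly anneal epsilon from `initial_eps` to `final_eps` over the first
    `decay_fraction` of training, then keep it constant.
    """
    decay_steps = max(1, int(decay_fraction * total_steps))
    frac = min(1.0, step / decay_steps)
    return initial_eps + frac * (final_eps - initial_eps)


def update_target_net(net, target_net, tau=None):
    """
    Target network update: a hard copy when `tau` is None, otherwise an
    exponential-averaging (Polyak) update with coefficient `tau`.
    """
    if tau is None:
        target_net.load_state_dict(net.state_dict())
        return
    with torch.no_grad():
        for p, tp in zip(net.parameters(), target_net.parameters()):
            tp.data.mul_(1.0 - tau).add_(tau * p.data)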