from collections import OrderedDict
from typing import Tuple, Optional, List, Dict
import math
from operator import mul
from functools import reduce
import copy

import numpy as np
import torch
import torch.nn as nn
from torch.nn import Conv2d, Dropout

from clip import clip
from clip.simple_tokenizer import SimpleTokenizer as _Tokenizer

_tokenizer = _Tokenizer()

class PromptVisionTransformer(nn.Module):
    def __init__(self, clip_model, device, clip_model_type, DeepPrompt, n_vtk):
        super().__init__()
        self.device = device
        self.input_resolution = clip_model.visual.input_resolution
        self.output_dim = clip_model.visual.output_dim
        self.conv1 = clip_model.visual.conv1
        
        self.class_embedding = clip_model.visual.class_embedding
        self.positional_embedding = clip_model.visual.positional_embedding
        self.ln_pre = clip_model.visual.ln_pre

        self.transformer = clip_model.visual.transformer

        self.ln_post = clip_model.visual.ln_post
        self.proj = clip_model.visual.proj
        self.Deep = DeepPrompt
        self.n_vtk = n_vtk

        # prompt config
        if "ViT-B/32" in clip_model_type:
            self.patch_size = (32, 32)
            _, self.prompt_dim = self.positional_embedding.shape
            self.num_tokens = n_vtk
        elif "ViT-B/16" in clip_model_type:
            self.patch_size = (16, 16)
            _, self.prompt_dim = self.positional_embedding.shape
            self.num_tokens = n_vtk
        self.hidden_size = 768
        self.prompt_dropout = Dropout(0.1)

        self.attn_list = []

    def source_prompt_init(self, source_prompts_file, multi_p_mode):
        self.prompt_num = len(source_prompts_file)
        self.multi_p_mode = multi_p_mode
        self.source_prompt_list = nn.ParameterList([])
        self.source_prompt_attn_weight_list = nn.ModuleList([])
        for i in range(self.prompt_num):
            parameters = torch.load(source_prompts_file[i])
            if "net" in list(parameters.keys())[0]:
                key_prompt_embeddings = "net.visual_backbone.prompt_embeddings"
                key_prompt_proj = "net.visual_backbone.prompt_proj"
            else:
                key_prompt_embeddings = "visual_backbone.prompt_embeddings"
                key_prompt_proj = "visual_backbone.prompt_proj"
            with torch.no_grad():
                # prompt embeddings
                source_prompt = parameters[key_prompt_embeddings].clone()
                source_prompt_embeddings = nn.Parameter(source_prompt).requires_grad_(False)
                source_prompt_embeddings.to(self.device)
                assert (source_prompt_embeddings.to(self.device) == parameters[key_prompt_embeddings]).all()
                # prompt proj
                source_prompt_proj = torch.nn.Linear(self.prompt_dim, self.hidden_size)
                source_prompt_proj.weight.copy_(parameters[key_prompt_proj+".weight"].clone())
                source_prompt_proj.bias.copy_(parameters[key_prompt_proj+".bias"].clone())
                source_prompt_proj.requires_grad_(False)
                source_prompt_proj.to(self.device)
                assert (source_prompt_proj.bias.to(self.device) == parameters[key_prompt_proj+".bias"]).all()
                prompt = self.incorporate_prompt(source_prompt_embeddings, source_prompt_proj)
                self.source_prompt_list.append(prompt)
        self.source_prompt_list.requires_grad_(False)

    def meta_prompt_init(self,):
        self.meta_prompt_proj = nn.Linear(self.prompt_dim, self.hidden_size)
        nn.init.kaiming_normal_(self.meta_prompt_proj.weight, a=0, mode='fan_out')

        val = math.sqrt(6. / float(3 * reduce(mul, self.patch_size, 1) + self.prompt_dim))  # noqa

        self.meta_prompt_embeddings = nn.Parameter(torch.zeros(
            1, self.num_tokens, self.prompt_dim))
        # xavier_uniform initialization
        nn.init.uniform_(self.meta_prompt_embeddings.data, -val, val)

        if self.Deep:  # Deep prompt version noqa
            total_d_layer = 12-1
            self.deep_meta_prompt_embeddings = nn.Parameter(torch.zeros(
                total_d_layer, self.num_tokens, self.prompt_dim))
            # xavier_uniform initialization
            nn.init.uniform_(self.deep_meta_prompt_embeddings.data, -val, val)

    def incorporate_prompt(self, prompt_embeddings, prompt_proj=None, x=None):
        if x is not None:
            if prompt_proj is not None:
                B = x.size(0)
                x = torch.cat((
                        x[:, :1, :],
                        self.prompt_dropout(prompt_proj(prompt_embeddings).expand(B, -1, -1)),
                        x[:, 1:, :]
                    ), dim=1)
                #print(x.shape) -> (batch_size, cls_token + n_prompt + n_patches, hidden_dim)
                x = self.ln_pre(x)
            else:
                B = x.size(0)
                x = torch.cat((
                        x[:, :1, :],
                        prompt_embeddings.expand(B, -1, -1),
                        x[:, 1:, :]
                    ), dim=1)
                #print(x.shape) -> (batch_size, cls_token + n_prompt + n_patches, hidden_dim)
                x = self.ln_pre(x)
        else:
            x = self.prompt_dropout(prompt_proj(prompt_embeddings))
        
        return x

    def post_prompt_forward(self, X):
        P_list = []
        for prompt in self.source_prompt_list:
            P = self.incorporate_prompt(prompt, x=X) # torch.Size([B, (50+8), hidden_size])
            P_list.append(P)
        B, L, D = P.shape
        
        multi_P = torch.cat(P_list, dim=1).view(B*self.prompt_num, L, D) #torch.Size([B*prompt_num, (50+8), hidden_size])
        return multi_P

    def forward(self, x, mode):
        with torch.no_grad():
            x = self.conv1(x)  # shape = [*, width, grid, grid]
            x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
            x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
            x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1)  # shape = [*, grid ** 2 + 1, width]
            x = x + self.positional_embedding.to(x.dtype)
        # incorporate_prompt
        if mode:
            x = self.incorporate_prompt(self.meta_prompt_embeddings, self.meta_prompt_proj, x)
        elif self.multi_p_mode[0] == "ENSEMBLE":
            # post prompt attention: making prompted input instance
            x = self.post_prompt_forward(x)
        else:
            raise NotImplementedError
        
        x = x.permute(1, 0, 2)  # NLD -> LND

        x = self.transformer(x)
        
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.ln_post(x[:, 0, :])

        if self.proj is not None:
            x = x @ self.proj
        
        return x

class CONPEMultiVisualPromptTuningCLIP(nn.Module):
    def __init__(self, clip_model, device, clip_model_type="CLIPViT-B/32", DeepPrompt=False, n_vtk=8):
        super(CONPEMultiVisualPromptTuningCLIP, self).__init__()
        self.visual_backbone = PromptVisionTransformer(clip_model, device, clip_model_type, DeepPrompt, n_vtk)
        self.logit_scale = clip_model.logit_scale

    def prompt_init(self, source_prompts_file, multi_p_mode="ENSEMBLE", meta_mode=False):
        self.visual_backbone.source_prompt_init(source_prompts_file, multi_p_mode)
        if meta_mode:
            self.visual_backbone.meta_prompt_init()

    def forward(self, images, mode=False):
        image_features = self.visual_backbone(images, mode)
        return image_features