from collections import OrderedDict
from typing import Tuple, Optional, List, Dict
import random
import time
import warnings
import copy
import argparse
import shutil
import os
import os.path as osp
import math
from operator import mul
from functools import reduce

import numpy as np
import tqdm
from torch.nn import Conv2d, Dropout
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torchvision      

import utils

from clip import clip
from clip.simple_tokenizer import SimpleTokenizer as _Tokenizer

_tokenizer = _Tokenizer()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.autograd.set_detect_anomaly(True)

class PromptVisionTransformer(nn.Module):
    def __init__(self, clip_model, clip_model_type, DeepPrompt, n_vtk):
        super().__init__()
        self.input_resolution = clip_model.visual.input_resolution
        self.output_dim = clip_model.visual.output_dim
        self.conv1 = clip_model.visual.conv1
        
        self.class_embedding = clip_model.visual.class_embedding
        self.positional_embedding = clip_model.visual.positional_embedding
        self.ln_pre = clip_model.visual.ln_pre

        self.transformer = clip_model.visual.transformer

        self.ln_post = clip_model.visual.ln_post
        self.proj = clip_model.visual.proj
        self.Deep = DeepPrompt

        # prompt config
        if "ViT-B/32" in clip_model_type:
            patch_size = (32, 32)
            _, prompt_dim = self.positional_embedding.shape
            self.num_tokens = n_vtk
        elif "ViT-B/16" in clip_model_type:
            patch_size = (16, 16)
            _, prompt_dim = self.positional_embedding.shape
            self.num_tokens = n_vtk
        hidden_size = 768
        self.prompt_dropout = Dropout(0.1)
        self.prompt_proj = nn.Linear(prompt_dim, hidden_size)
        nn.init.kaiming_normal_(self.prompt_proj.weight, a=0, mode='fan_out')

        val = math.sqrt(6. / float(3 * reduce(mul, patch_size, 1) + prompt_dim))  # noqa

        self.prompt_embeddings = nn.Parameter(torch.zeros(
            1, self.num_tokens, prompt_dim))
        # xavier_uniform initialization
        nn.init.uniform_(self.prompt_embeddings.data, -val, val)

        if self.Deep:  # Deep prompt version noqa
            total_d_layer = 12-1
            self.deep_prompt_embeddings = nn.Parameter(torch.zeros(
                total_d_layer, self.num_tokens, prompt_dim))
            # xavier_uniform initialization
            nn.init.uniform_(self.deep_prompt_embeddings.data, -val, val)
        
    def forward(self, x):
        x = self.conv1(x)  # shape = [*, width, grid, grid]
        x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
        x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1)  # shape = [*, grid ** 2 + 1, width]
        #print(x.shape)
        x = x + self.positional_embedding.to(x.dtype)
        #print(self.positional_embedding.shape)
        
        # incorporate_prompt
        B = x.size(0)
        x = torch.cat((
                x[:, :1, :],
                self.prompt_dropout(self.prompt_proj(self.prompt_embeddings).expand(B, -1, -1)),
                x[:, 1:, :]
            ), dim=1)
        #print(x.shape) -> (batch_size, cls_token + n_prompt + n_patches, hidden_dim)
        x = self.ln_pre(x)
        x = x.permute(1, 0, 2)  # NLD -> LND

        if self.Deep:  # Deep prompt version
            hidden_states = None
            num_layers = self.transformer.layers

            for i in range(num_layers):
                if i == 0:
                    hidden_states = self.transformer.resblocks[i](x)
                else:
                    if i <= self.deep_prompt_embeddings.shape[0]:
                        deep_prompt_emb = self.prompt_dropout(self.prompt_proj(
                            self.deep_prompt_embeddings[i-1]).expand(B, -1, -1))
                        
                        deep_prompt_emb = deep_prompt_emb.permute(1, 0, 2)  # NLD -> LND
            
                        hidden_states = torch.cat((
                            hidden_states[:1, :, :],
                            deep_prompt_emb,
                            hidden_states[(1+self.num_tokens):, :, :]
                        ), dim=0)

                    hidden_states = self.transformer.resblocks[i](hidden_states)
            x = hidden_states
        else:
            x = self.transformer(x)
        
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.ln_post(x[:, 0, :])

        if self.proj is not None:
            x = x @ self.proj

        return x

class TextEncoder(nn.Module):
    def __init__(self, clip_model):
        super().__init__()
        self.transformer = clip_model.transformer
        self.positional_embedding = clip_model.positional_embedding
        self.ln_final = clip_model.ln_final
        self.text_projection = clip_model.text_projection
        self.dtype = clip_model.dtype

    def forward(self, prompts, tokenized_prompts):
        x = prompts + self.positional_embedding.type(self.dtype)
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.ln_final(x).type(self.dtype)

        # x.shape = [batch_size, n_ctx, transformer.width]
        # take features from the eot embedding (eot_token is the highest number in each sequence)
        x = x[torch.arange(x.shape[0]), tokenized_prompts.argmax(dim=-1)] @ self.text_projection
        return x

class DualPromptViTCLIPImageClassifier(nn.Module):
    def __init__(self, clip_model, classnames, feat_dim, device, clip_model_type="CLIPViT-B/16", DeepPrompt=False, n_vtk=5, use_csc=False, n_ctx=8):
        super(DualPromptViTCLIPImageClassifier, self).__init__()
        self.logit_scale = clip_model.logit_scale
        self.visual_backbone = PromptVisionTransformer(clip_model, clip_model_type, DeepPrompt, n_vtk)
        self.textual_backbone = CustomCLIPTextual(clip_model, classnames, device, use_csc, n_ctx)
        self._features_dim = feat_dim

        print("Turning off gradients in both the image and the text encoder")
        for name, param in self.named_parameters():
            if "prompt" not in name:
                 if ("prompt_learner" not in name):
                    param.requires_grad_(False)
        # Double check
        enabled = set()
        for name, param in self.named_parameters():
            if param.requires_grad:
                enabled.add(name)
        print(f"Parameters to be updated: {enabled}")

        for name, param in clip_model.named_parameters():
            param.requires_grad_(False)
            assert not param.requires_grad
        for name, param in self.textual_backbone.text_encoder.named_parameters():
            param.requires_grad_(False)
            assert not param.requires_grad
    
    def forward(self, images):
        image_features = self.visual_backbone(images)
        text_features = self.textual_backbone()

        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        logit_scale = self.logit_scale.exp()
        logits = logit_scale * image_features @ text_features.t()

        if self.training:
            return logits, image_features
        else:
            return logits
    
    @property
    def features_dim(self) -> int:
        """The dimension of features before the final `head` layer"""
        return self._features_dim
    
    def freeze_bn(self):
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d):
                m.eval()
    
    def get_parameters(self, optimize_head=False, base_lr=1.0) -> List[Dict]:
        """A parameter list which decides optimization hyper-parameters,
            such as the relative learning rate of each layer
        """
        params = [
            {"params": self.visual_backbone.parameters(), "lr": 1.0 * base_lr},
            {"params": self.textual_backbone.prompt_learner.parameters(), "lr": 1.0 * base_lr},
        ]

        return params

class TextPromptLearner(nn.Module):
    def __init__(self, clip_model, classnames, device, use_csc, n_ctx):
        super().__init__()
        n_cls = len(classnames)
        n_ctx = n_ctx
        ctx_init = None # "a photo of" 
        dtype = clip_model.dtype
        ctx_dim = clip_model.ln_final.weight.shape[0]
        clip_imsize = clip_model.visual.input_resolution
        cfg_imsize = 224
        assert cfg_imsize == clip_imsize, f"cfg_imsize ({cfg_imsize}) must equal to clip_imsize ({clip_imsize})"
        use_csc = use_csc

        if ctx_init:
            # use given words to initialize context vectors
            ctx_init = ctx_init.replace("_", " ")
            n_ctx = len(ctx_init.split(" "))
            prompt = clip.tokenize(ctx_init).to(device)
            with torch.no_grad():
                embedding = clip_model.token_embedding(prompt).type(dtype)
            ctx_vectors = embedding[0, 1 : 1 + n_ctx, :]
            prompt_prefix = ctx_init

        else:
            # random initialization
            if use_csc:
                print("Initializing class-specific contexts")
                ctx_vectors = torch.empty(n_cls, n_ctx, ctx_dim, dtype=dtype)
            else:
                print("Initializing a generic context")
                ctx_vectors = torch.empty(n_ctx, ctx_dim, dtype=dtype)
            nn.init.normal_(ctx_vectors, std=0.02)
            prompt_prefix = " ".join(["X"] * n_ctx)

        print(f'Initial context: "{prompt_prefix}"')
        print(f"Number of context words (tokens): {n_ctx}")

        self.ctx = nn.Parameter(ctx_vectors)#.requires_grad_(False)  # to be optimized

        classnames = [name.replace("_", " ") for name in classnames]
        name_lens = [len(_tokenizer.encode(name)) for name in classnames]
        prompts = [prompt_prefix + " " + name + "." for name in classnames]

        tokenized_prompts = torch.cat([clip.tokenize(p) for p in prompts]).to(device)
        with torch.no_grad():
            embedding = clip_model.token_embedding(tokenized_prompts).type(dtype)

        # These token vectors will be saved when in save_model(),
        # but they should be ignored in load_model() as we want to use
        # those computed using the current class names
        self.register_buffer("token_prefix", embedding[:, :1, :])  # SOS
        self.register_buffer("token_suffix", embedding[:, 1 + n_ctx :, :])  # CLS, EOS

        self.n_cls = n_cls
        self.n_ctx = n_ctx
        self.tokenized_prompts = tokenized_prompts  # torch.Tensor
        self.name_lens = name_lens
        self.class_token_position = "end"

    def forward(self):
        ctx = self.ctx
        if ctx.dim() == 2:
            ctx = ctx.unsqueeze(0).expand(self.n_cls, -1, -1)

        prefix = self.token_prefix
        suffix = self.token_suffix

        if self.class_token_position == "end":
            prompts = torch.cat(
                [
                    prefix,  # (n_cls, 1, dim)
                    ctx,     # (n_cls, n_ctx, dim)
                    suffix,  # (n_cls, *, dim)
                ],
                dim=1,
            )

        elif self.class_token_position == "middle":
            half_n_ctx = self.n_ctx // 2
            prompts = []
            for i in range(self.n_cls):
                name_len = self.name_lens[i]
                prefix_i = prefix[i : i + 1, :, :]
                class_i = suffix[i : i + 1, :name_len, :]
                suffix_i = suffix[i : i + 1, name_len:, :]
                ctx_i_half1 = ctx[i : i + 1, :half_n_ctx, :]
                ctx_i_half2 = ctx[i : i + 1, half_n_ctx:, :]
                prompt = torch.cat(
                    [
                        prefix_i,     # (1, 1, dim)
                        ctx_i_half1,  # (1, n_ctx//2, dim)
                        class_i,      # (1, name_len, dim)
                        ctx_i_half2,  # (1, n_ctx//2, dim)
                        suffix_i,     # (1, *, dim)
                    ],
                    dim=1,
                )
                prompts.append(prompt)
            prompts = torch.cat(prompts, dim=0)

        elif self.class_token_position == "front":
            prompts = []
            for i in range(self.n_cls):
                name_len = self.name_lens[i]
                prefix_i = prefix[i : i + 1, :, :]
                class_i = suffix[i : i + 1, :name_len, :]
                suffix_i = suffix[i : i + 1, name_len:, :]
                ctx_i = ctx[i : i + 1, :, :]
                prompt = torch.cat(
                    [
                        prefix_i,  # (1, 1, dim)
                        class_i,   # (1, name_len, dim)
                        ctx_i,     # (1, n_ctx, dim)
                        suffix_i,  # (1, *, dim)
                    ],
                    dim=1,
                )
                prompts.append(prompt)
            prompts = torch.cat(prompts, dim=0)

        else:
            raise ValueError

        return prompts

class PromptTextual(nn.Module):
    def __init__(self, clip_model, classnames, device, use_csc, n_ctx):
        super().__init__()
        self.prompt_learner = TextPromptLearner(clip_model, classnames, device, use_csc, n_ctx)
        self.tokenized_prompts = self.prompt_learner.tokenized_prompts
        self.text_encoder = TextEncoder(clip_model)
        self.dtype = clip_model.dtype

    def forward(self, indices, mode=None):
        if mode == "scene":
            prompts = self.prompt_learner()
            tokenized_prompts = self.tokenized_prompts

            #prompts = prompts[indices]
            #tokenized_prompts = tokenized_prompts[indices]
        elif mode == "name":
            prompts = self.prompt_learner()
            tokenized_prompts = self.tokenized_prompts

        text_features = self.text_encoder(prompts, tokenized_prompts)
        text_features = text_features.type(self.dtype)
        return text_features

class DualPromptViTCLIP(nn.Module):
    def __init__(self, clip_model, classnames, feat_dim, device, clip_model_type="CLIPViT-B/16", DeepPrompt=False, n_vtk=5, use_csc=False, n_ctx=8):
        super(DualPromptViTCLIP, self).__init__()
        self.logit_scale = clip_model.logit_scale
        self.visual_backbone = PromptVisionTransformer(clip_model, clip_model_type, DeepPrompt, n_vtk)
        self.textual_backbone = PromptTextual(clip_model, classnames, device, use_csc, n_ctx)
        self._features_dim = feat_dim

        print("Turning off gradients in both the image and the text encoder")
        for name, param in self.named_parameters():
            if "prompt" not in name:
                 if ("prompt_learner" not in name):
                    param.requires_grad_(False)
        # Double check
        enabled = set()
        for name, param in self.named_parameters():
            if param.requires_grad:
                enabled.add(name)
        print(f"Parameters to be updated: {enabled}")

        for name, param in clip_model.named_parameters():
            param.requires_grad_(False)
            assert not param.requires_grad
        for name, param in self.textual_backbone.text_encoder.named_parameters():
            param.requires_grad_(False)
            assert not param.requires_grad
    
    def forward(self, images, indices=None, mode=None):
        
        image_features = self.visual_backbone(images)
        text_features = self.textual_backbone(indices, mode)

        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)

        logits_per_image = image_features @ text_features.t()
        logits_per_text = logits_per_image.t()

        if self.training:
            return logits_per_image, logits_per_text, image_features
        else:
            return logits_per_image, image_features
    
    @property
    def features_dim(self) -> int:
        """The dimension of features before the final `head` layer"""
        return self._features_dim
    
    def freeze_bn(self):
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d):
                m.eval()
    
    def get_parameters(self, optimize_head=False, base_lr=1.0) -> List[Dict]:
        """A parameter list which decides optimization hyper-parameters,
            such as the relative learning rate of each layer
        """
        params = [
            {"params": self.visual_backbone.parameters(), "lr": 1.0 * base_lr},
            {"params": self.textual_backbone.prompt_learner.parameters(), "lr": 1.0 * base_lr},
        ]

        return params

def freeze_model(model):
    for param in model.parameters():
        param.requires_grad = False
    for module in model.modules():
        if "BatchNorm" in type(module).__name__:
            module.momentum = 0.0
    model.eval()
    return model

def convert_models_to_fp32(model):
    for p in model.parameters():
        p.data = p.data.float()
        if p.grad:
            p.grad.data = p.grad.data.float()

def seed_fix(n):
    random.seed(n)
    np.random.seed(n)
    torch.manual_seed(n)
    if torch.backends.cudnn.enabled:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

class Classifier:
    def __init__(self, checkpoint_path):
        seed_fix(777)

        # create model
        print("=> using clip_model model '{}'".format("CLIPViT-B/32"))
        clip_model, _ = clip.load("ViT-B/32", device=device)
        convert_models_to_fp32(clip_model)
        clip_model = freeze_model(clip_model)
        clip_model.eval()

        dummy_input = torch.randn([1, 3, 224, 224]).to(device)
        feature = clip_model.encode_image(dummy_input)
        feat_dim = feature.size(-1)
        class_names = ['Drawer', 'Floor', 'CounterTop', 'Mirror', 'StoveBurner', 'Cabinet', 'SideTable', 'Sink', 'GarbageBag', 
                    'CoffeeMachine', 'Toaster', 'Pot', 'Plate', 'GarbageCan', 'StoveKnob', 'Fork', 'SoapBottle', 'Fridge', 
                    'Pan', 'Window', 'Egg', 'Spatula', 'Microwave', 'Cup', 'SinkBasin', 'SaltShaker', 'PepperShaker', 
                    'ButterKnife', 'DishSponge', 'LightSwitch', 'Spoon', 'Knife', 'Shelf', 'Mug', 'DiningTable', 'Blinds', 
                    'AluminumFoil', 'Faucet', 'Bowl', 'Chair']


        self.model = DualPromptViTCLIP(clip_model, 
                            class_names,
                            feat_dim, device, 
                            clip_model_type="CLIPViT-B/32", DeepPrompt=False, n_vtk=8, use_csc=False, n_ctx=16).to(device)

        self.model.load_state_dict(torch.load(checkpoint_path))
        for name, param in self.model.visual_backbone.named_parameters():
            if param.requires_grad:
                param.requires_grad_(False)
        self.model.eval()
    def forward(self, images):
        output, _ = self.model(images, mode="name")
        output = F.sigmoid(output)
        return output

# _, preprocess = clip.load("ViT-B/32", device=device)
# from PIL import Image
# image = preprocess(Image.open("test.png")).unsqueeze(0).to(device)

# c = Classifier("/home/chois/best.pth")
# print(c.forward(image))