import pdb

import numpy as np
import os
import random
import shutil
import torch
import torch.distributed as dist
import torch.autograd as autograd
import argparse
import json
import math
import os
import random
import shutil
import time
import timm
import warnings

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as T
from sklearn.decomposition import PCA

from torch.utils.tensorboard import SummaryWriter

from PIL import ImageFilter

from tqdm import tqdm


def get_model(model):
    """Return the wrapped network behind a parallel wrapper.

    If ``model`` is a ``torch.nn.DataParallel`` or a
    ``torch.nn.parallel.DistributedDataParallel`` instance its ``.module``
    attribute is returned; any other object is returned unchanged.
    """
    parallel_wrappers = (
        torch.nn.DataParallel,
        torch.nn.parallel.DistributedDataParallel,
    )
    return model.module if isinstance(model, parallel_wrappers) else model


def setup_for_distributed(is_master):
    """
    Replace the builtin ``print`` with a version that is silent unless
    ``is_master`` is true or the call passes ``force=True``.

    The ``force`` keyword is consumed by the wrapper and never forwarded to
    the print function that was installed before this call.
    """
    import builtins

    original_print = builtins.print

    def master_only_print(*args, **kwargs):
        forced = kwargs.pop('force', False)
        if not (is_master or forced):
            return
        original_print(*args, **kwargs)

    builtins.print = master_only_print


def is_dist_avail_and_initialized():
    """Return True only if ``torch.distributed`` is available and initialized."""
    # ``is_initialized`` is only queried when the package is available.
    return dist.is_available() and dist.is_initialized()


def get_world_size():
    """Return the distributed world size, or 1 when no process group is active."""
    return dist.get_world_size() if is_dist_avail_and_initialized() else 1


def get_rank():
    """Return this process's distributed rank, or 0 when no process group is active."""
    return dist.get_rank() if is_dist_avail_and_initialized() else 0


def is_main_process():
    """Return True when the current process has rank 0."""
    rank = get_rank()
    return rank == 0


def save_on_master(state, is_best, output_dir):
    """Write ``state`` to ``<output_dir>/checkpoint.pt`` on the main process.

    When ``is_best`` is true the freshly written file is additionally copied
    to ``<output_dir>/checkpoint_best.pt``. Processes other than the main one
    do nothing.
    """
    if not is_main_process():
        return

    ckpt_path = f'{output_dir}/checkpoint.pt'
    torch.save(state, ckpt_path)
    if is_best:
        best_path = f'{output_dir}/checkpoint_best.pt'
        shutil.copyfile(ckpt_path, best_path)


def savelast_on_master(state, output_dir):
    """Write ``state`` to ``<output_dir>/checkpoint_last.pt`` on the main process only."""
    if not is_main_process():
        return
    torch.save(state, f'{output_dir}/checkpoint_last.pt')


def init_distributed_mode(args):
    """Configure ``args`` and the process group from environment variables.

    * ``RANK`` and ``WORLD_SIZE`` both set: ``args.rank``, ``args.world_size``
      and ``args.gpu`` are read from ``RANK``, ``WORLD_SIZE`` and
      ``LOCAL_RANK``.
    * otherwise, ``SLURM_PROCID`` set: ``args.rank`` is read from it and
      ``args.gpu`` is the rank modulo the visible CUDA device count.
      ``args.world_size`` is NOT assigned in this branch, so it must already
      be present on ``args``.
    * neither: ``args.distributed`` is set to False and the function returns.

    In the distributed cases the CUDA device is selected, the ``nccl``
    process group is initialised from ``args.dist_url``, all ranks
    synchronise on a barrier, and printing is restricted to rank 0.
    """
    env = os.environ
    has_rank_env = 'RANK' in env and 'WORLD_SIZE' in env
    has_slurm_env = 'SLURM_PROCID' in env

    if not has_rank_env and not has_slurm_env:
        print('Not using distributed mode')
        args.distributed = False
        return

    if has_rank_env:
        args.rank = int(env["RANK"])
        args.world_size = int(env['WORLD_SIZE'])
        args.gpu = int(env['LOCAL_RANK'])
    else:
        args.rank = int(env['SLURM_PROCID'])
        args.gpu = args.rank % torch.cuda.device_count()

    args.distributed = True

    torch.cuda.set_device(args.gpu)
    args.dist_backend = 'nccl'
    print('| distributed init (rank {}): {}'.format(
        args.rank, args.dist_url), flush=True)
    torch.distributed.init_process_group(
        backend=args.dist_backend,
        init_method=args.dist_url,
        world_size=args.world_size,
        rank=args.rank,
    )
    torch.distributed.barrier()
    setup_for_distributed(args.rank == 0)


def scaled_all_reduce(tensors, is_scale=True):
    """All-reduce every tensor in ``tensors`` in place across processes.

    The reductions are launched asynchronously and then awaited. When
    ``is_scale`` is true each reduced tensor is multiplied in place by
    ``1 / world_size``. With a world size of 1 the input is returned
    untouched. The same ``tensors`` object is returned in all cases.
    """
    world_size = get_world_size()
    if world_size == 1:
        # Single process: nothing to reduce.
        return tensors

    # Launch all reductions first so they can overlap, then wait on each.
    pending = [dist.all_reduce(t, async_op=True) for t in tensors]
    for handle in pending:
        handle.wait()

    if is_scale:
        for t in tensors:
            t.mul_(1.0 / world_size)
    return tensors


def all_gather_batch(tensors):
    """
    Gather each tensor in ``tensors`` from every process and concatenate the
    gathered copies along dim 0.

    Returns a list with one concatenated tensor per input tensor. With a
    world size of 1 the input is returned unchanged.
    """
    world_size = get_world_size()
    if world_size == 1:
        # Single process: nothing to gather.
        return tensors

    gathered = []
    for tensor in tensors:
        # One receive buffer per rank, shaped like the local tensor.
        buffers = [torch.ones_like(tensor) for _ in range(world_size)]
        dist.all_gather(buffers, tensor, async_op=False)
        gathered.append(torch.cat(buffers, dim=0))
    return gathered


class GatherLayer(autograd.Function):
    """
    Autograd-aware all_gather.

    ``forward`` collects ``x`` from every process; ``backward`` all-reduces
    the incoming gradients and hands back the slice belonging to this rank,
    so the graph stays connected through the gather.
    """

    @staticmethod
    def forward(ctx, x):
        # One zero-initialised receive buffer per process in the group.
        world = dist.get_world_size()
        gathered = [torch.zeros_like(x) for _ in range(world)]
        dist.all_gather(gathered, x)
        return tuple(gathered)

    @staticmethod
    def backward(ctx, *grads):
        # Stack the per-rank gradients, combine them across processes with
        # all_reduce, then keep only the entry for the local rank.
        stacked = torch.stack(grads)
        dist.all_reduce(stacked)
        local_rank = dist.get_rank()
        return stacked[local_rank]


def all_gather_batch_with_grad(tensors):
    """
    Gather each tensor in ``tensors`` from every process through
    ``GatherLayer`` and concatenate the gathered copies along dim 0.

    Returns a list with one concatenated tensor per input tensor. With a
    world size of 1 the input is returned unchanged.
    """
    if get_world_size() == 1:
        # Single process: nothing to gather.
        return tensors

    return [torch.cat(GatherLayer.apply(tensor), dim=0) for tensor in tensors]


def cosine_scheduler(base_value, final_value, epochs, niter_per_ep, warmup_epochs=0, start_warmup_value=0):
    """Build a per-iteration schedule: linear warmup followed by cosine decay.

    Args:
        base_value: value reached at the end of warmup and at the start of
            the cosine phase.
        final_value: value the cosine phase decays towards.
        epochs: total number of epochs.
        niter_per_ep: iterations per epoch.
        warmup_epochs: number of warmup epochs; may be fractional, in which
            case the warmup iteration count is truncated to an integer.
        start_warmup_value: first value of the linear warmup.

    Returns:
        1-D numpy array of length ``epochs * niter_per_ep``.

    Raises:
        AssertionError: if the assembled schedule does not have
            ``epochs * niter_per_ep`` entries (e.g. warmup longer than the
            whole run).
    """
    total_iters = epochs * niter_per_ep
    # np.linspace / np.arange need an integer count; a fractional
    # warmup_epochs would otherwise produce a float and raise TypeError.
    warmup_iters = int(warmup_epochs * niter_per_ep)

    warmup_schedule = np.array([])
    if warmup_epochs > 0:
        warmup_schedule = np.linspace(start_warmup_value, base_value, warmup_iters)

    iters = np.arange(total_iters - warmup_iters)
    schedule = final_value + 0.5 * (base_value - final_value) * (1 + np.cos(np.pi * iters / len(iters)))

    schedule = np.concatenate((warmup_schedule, schedule))
    assert len(schedule) == total_iters
    return schedule


class GaussianBlur(object):
    """Gaussian blur augmentation in SimCLR https://arxiv.org/abs/2002.05709

    ``sigma`` is a two-element sequence ``(low, high)``; each call draws a
    blur radius uniformly from that range.
    """

    # The default is an immutable tuple: with a list default every instance
    # built without arguments would share (and could mutate) the same object.
    def __init__(self, sigma=(.1, 2.)):
        self.sigma = sigma

    def __call__(self, x):
        """Return ``x`` filtered with a Gaussian blur of random radius.

        ``x`` must provide a ``filter`` method accepting a PIL
        ``ImageFilter`` object.
        """
        sigma = random.uniform(self.sigma[0], self.sigma[1])
        x = x.filter(ImageFilter.GaussianBlur(radius=sigma))
        return x


class AverageMeter(object):
    """Computes and stores the average and current value.

    Attributes set by ``reset``/``update``: ``val`` (last value), ``sum``
    (weighted sum), ``count`` (total weight) and ``avg`` (``sum / count``).
    ``fmt`` is a format-spec suffix such as ``':.3f'`` used by ``__str__``.
    """

    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        """Zero all running statistics."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running average."""
        self.val = val
        self.sum += val * n
        self.count += n
        # A zero total weight (e.g. update(..., n=0) on a fresh meter) would
        # otherwise raise ZeroDivisionError; report an average of 0 instead.
        self.avg = self.sum / self.count if self.count else 0

    def __str__(self):
        # Renders as "<name> <val> (<avg>)" with both numbers using self.fmt.
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)


class ProgressMeter(object):
    """Prints a tab-separated progress line: prefix, batch counter, then each meter."""

    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        """Print one progress line for batch index ``batch``."""
        header = self.prefix + self.batch_fmtstr.format(batch)
        columns = [header]
        columns.extend(str(meter) for meter in self.meters)
        print('\t'.join(columns))

    def _get_batch_fmtstr(self, num_batches):
        """Return a template like ``[{:3d}/100]`` padded to the width of ``num_batches``."""
        width = len(str(num_batches // 1))
        fmt = '{:' + str(width) + 'd}'
        return ''.join(['[', fmt, '/', fmt.format(num_batches), ']'])


def embed_stack(embeds, labels, paths=None):
    """
    Combine per-batch embeddings/labels/paths into one numpy array each.

    Args:
        embeds: either a list of tensors, or a dict (any ``dict`` subclass,
            e.g. ``defaultdict``) mapping a dataset name to such a list.
        labels: same structure as ``embeds``, holding label tensors.
        paths: optional sequence of per-batch path sequences; concatenated
            with ``np.concatenate`` when given.

    Returns:
        ``(embeds, labels)`` or ``(embeds, labels, paths)`` when ``paths`` is
        not None. In the dict case the input dicts are updated in place and
        returned.
    """
    # isinstance (rather than an exact type comparison) so that dict
    # subclasses such as collections.defaultdict take the per-dataset path.
    if isinstance(embeds, dict):
        for dsname in list(embeds):
            embeds[dsname] = torch.vstack(embeds[dsname]).numpy()
            labels[dsname] = torch.cat(labels[dsname]).numpy()
    else:
        embeds = torch.vstack(embeds).numpy()
        labels = torch.cat(labels).numpy()

    if paths is not None:
        return embeds, labels, np.concatenate(paths)
    return embeds, labels


def load_features(embed_file):
    """Load embeddings from an ``.npz`` archive and L2-normalize them.

    Args:
        embed_file: path (or file object) accepted by ``np.load`` pointing to
            an archive with ``embeds`` and ``labels`` entries and, optionally,
            a ``paths`` entry.

    Returns:
        ``(embeds, labels, paths)`` where ``embeds`` is a ``torch.Tensor``
        whose rows (dim 1) have unit L2 norm, ``labels`` is the stored numpy
        array, and ``paths`` is the stored numpy array or ``None`` when the
        archive has no ``paths`` entry.
    """
    # The archive object holds the file open until closed; the context
    # manager releases the handle once the arrays have been read.
    with np.load(embed_file) as embed_dict:
        embeds = embed_dict['embeds']
        labels = embed_dict['labels']
        # Archives written without image paths have no 'paths' entry.
        paths = embed_dict['paths'] if 'paths' in embed_dict.files else None
    embeds = torch.nn.functional.normalize(torch.Tensor(embeds), dim=1, p=2)
    return embeds, labels, paths

def feature_extract_and_save(dataloader, m, save_path, im_paths=False, device="cuda", train=False):
    """Run ``m`` over ``dataloader``, save the raw embeddings, return normalized ones.

    Args:
        dataloader: iterable yielding ``(image, label)`` batches, or
            ``(image, label, path)`` batches when ``im_paths`` is true.
        m: model; ``m.embed(image)`` is tried first and ``m(image)`` is used
            if that raises. A tuple output is reduced to its first element.
        save_path: destination passed to ``np.savez``; the file stores the
            stacked, un-normalized ``embeds`` and ``labels`` (plus ``paths``
            when ``im_paths`` is true).
        im_paths: whether batches carry image paths.
        device: device the images are moved to before the forward pass.
        train: unused; kept for interface compatibility.

    Returns:
        ``(embeds, labels)`` or ``(embeds, labels, paths)`` as numpy arrays,
        with ``embeds`` L2-normalized along dim 1.
    """
    embeds = []
    labels = []
    paths = [] if im_paths else None

    for batch in tqdm(dataloader):
        if im_paths:
            im, label, path = batch
            paths.append(path)
        else:
            im, label = batch
        im = im.to(device)
        with torch.no_grad():
            try:
                out = m.embed(im)
            except Exception:
                # Best-effort fallback for models without a usable ``embed``;
                # ``Exception`` (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate.
                out = m(im)
        if isinstance(out, tuple):
            out = out[0]
        embeds.append(out.to("cpu"))
        labels.append(label)

    if im_paths:
        embeds, labels, paths = embed_stack(embeds, labels, paths)
        np.savez(save_path, embeds=embeds, labels=labels, paths=paths)
    else:
        embeds, labels = embed_stack(embeds, labels)
        print(embeds.shape, labels.shape)
        np.savez(save_path, embeds=embeds, labels=labels)

    # Normalization happens after saving, so the file keeps raw embeddings.
    embeds = torch.nn.functional.normalize(torch.Tensor(embeds), dim=1, p=2).numpy()
    if im_paths:
        return embeds, labels, paths
    return embeds, labels

def rescale(x):
    """Min-max rescale ``x`` to the range [0, 1].

    ``x`` must provide ``min()``/``max()`` and support arithmetic (e.g. a
    numpy array or torch tensor). A constant input has zero range; it is
    mapped to all zeros instead of dividing 0 by 0.
    """
    x_shift = x - x.min()
    span = x_shift.max()
    if span == 0:
        # Constant input: divide by 1 so the result is zeros, not NaN.
        span = 1
    x_scale = x_shift / span
    return x_scale

def set_all_seeds(seed):
    """Seed Python's ``random``, numpy, torch and torch.cuda generators with ``seed``."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for seeder in seeders:
        seeder(seed)


# JSON flattening and unflattening to sidestep wandb nested dict issues
# https://github.com/wandb/wandb/issues/982
def flatten_json(json):
    """Flatten nested dicts in place into dotted keys (``{'a': {'b': 1}}`` -> ``{'a.b': 1}``).

    Nested dicts containing a ``'distribution'`` or ``'values'`` key are kept
    as they are. Inputs that are not exactly of type ``dict`` are ignored.
    Nothing is returned; ``json`` itself is modified.
    """
    if type(json) is not dict:
        return

    # Iterate over a snapshot because keys are removed and added in the loop.
    for key, child in list(json.items()):
        if type(child) is not dict or 'distribution' in child or 'values' in child:
            continue
        # Flatten the subtree first so its keys are already dotted.
        flatten_json(child)
        del json[key]
        json.update({key + "." + sub_key: value for sub_key, value in child.items()})


def unflatten_json(json):
    """Expand dotted keys in place into nested dicts (``{'a.b': 1}`` -> ``{'a': {'b': 1}}``).

    Keys are processed in reverse-sorted order. Intermediate dicts are
    created on demand. If a prefix of a dotted key already maps to a non-dict
    value an ``Exception`` is raised. Inputs that are not exactly of type
    ``dict`` are ignored. Nothing is returned; ``json`` itself is modified.
    """
    if type(json) is not dict:
        return

    for flat_key in sorted(json, reverse=True):
        if "." not in flat_key:
            continue

        *parents, leaf = flat_key.split(".")
        node = json
        # Walk down (creating dicts as needed) to the parent of the leaf.
        for depth, part in enumerate(parents):
            if part not in node:
                fresh = dict()
                node[part] = fresh
                node = fresh
                continue
            node = node[part]
            if type(node) is not dict:
                conflicting_key = ".".join(parents[:depth + 1])
                raise Exception('Key "{}" conflicts with key "{}"'.format(
                    flat_key, conflicting_key))

        if type(node) is dict:
            node[leaf] = json.pop(flat_key)

def replace_key(d, key, replacement):
    """Return a new dict in which every ``"<key>."`` substring of each key of
    ``d`` is replaced by ``"<replacement>."``; values are carried over as is."""
    renamed = {}
    for name, value in d.items():
        renamed[name.replace(key + '.', replacement + '.')] = value
    return renamed

def feature_pca(features, components=3, pcas=None, return_pca=False):
    """Project a (b, c, h, w) feature tensor onto PCA components per sample.

    Each sample is flattened to ``(h * w, c)`` and transformed by a PCA
    object. When ``pcas`` is None a separate ``PCA(n_components=components)``
    is fitted on every sample; otherwise sample ``i`` uses ``pcas[i]``, and
    samples beyond the end of the list reuse the last entry. The projections
    are then min-max scaled per sample (no guard against a zero range) and
    reshaped to ``(b, h, w, components)``.

    Returns the numpy array, or ``(array, pcas)`` when ``return_pca`` is true.
    """
    batch, channels, height, width = features.shape
    channels_last = features.detach().cpu().permute(0, 2, 3, 1)
    flat = channels_last.reshape(batch, height * width, channels)

    if pcas is None:
        # Fit one PCA per sample.
        pcas = []
        for sample in range(batch):
            fitted = PCA(n_components=components)
            fitted.fit(flat[sample])
            pcas.append(fitted)

    last = len(pcas) - 1
    projected = []
    for sample in range(batch):
        projected.append(pcas[min(sample, last)].transform(flat[sample]))
    features_pca = np.array(projected)

    # Per-sample min-max scaling over the spatial and component axes.
    features_pca = features_pca - features_pca.min(axis=(1, 2), keepdims=True)
    features_pca /= features_pca.max(axis=(1, 2), keepdims=True)
    features_pca = features_pca.reshape(batch, height, width, components)

    if return_pca:
        return features_pca, pcas
    return features_pca

class UnNormalize(object):
    """Invert a per-channel normalization: ``out = image * std + mean``.

    ``mean`` and ``std`` are per-channel sequences. Channels are paired with
    them via ``zip``, so only as many channels as there are entries are
    modified.
    """

    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, image):
        """Return an un-normalized copy of ``image``; the input is not modified.

        Accepts a 4-D batched tensor with channels on dim 1 or an unbatched
        tensor with channels on dim 0. The result has the same shape as the
        input.
        """
        image2 = torch.clone(image)
        batched = len(image2.shape) == 4
        # For a batch, move channels to the front so iteration yields one
        # channel at a time; the slices are views, so in-place ops write
        # through to ``image2``.
        channels_first = image2.permute(1, 0, 2, 3) if batched else image2
        for t, m, s in zip(channels_first, self.mean, self.std):
            t.mul_(s).add_(m)
        # Only a batched input was permuted, so only it is permuted back; an
        # unconditional 4-D permute fails on an unbatched image.
        return channels_first.permute(1, 0, 2, 3) if batched else image2


# Per-channel statistics shared by the ready-made transform pair below
# (these match the widely used ImageNet mean/std values).
_CHANNEL_MEAN = (0.485, 0.456, 0.406)
_CHANNEL_STD = (0.229, 0.224, 0.225)

# ``norm`` applies the normalization; ``unnorm`` is constructed with the same
# statistics to undo it. Each receives its own list objects.
norm = T.Normalize(list(_CHANNEL_MEAN), list(_CHANNEL_STD))
unnorm = UnNormalize(list(_CHANNEL_MEAN), list(_CHANNEL_STD))
