import os
import numpy as np
import torch
import torch.nn.functional as F
from MindVideo import create_Wen_dataset
import argparse
import datetime
import wandb
import torchvision.transforms as transforms
from einops import rearrange
from PIL import Image
import numbers
from omegaconf import OmegaConf
from typing import Dict, Optional, Tuple
from accelerate import Accelerator
from accelerate.utils import set_seed
import inspect
from MindVideo import UNet3DConditionModel
from MindVideo import TuneAVideoPipeline
from diffusers import AutoencoderKL, DDPMScheduler, DDIMScheduler
from diffusers.utils.import_utils import is_xformers_available
from diffusers.optimization import get_scheduler
import math 
from tqdm.auto import tqdm
from MindVideo import save_videos_grid
from accelerate.utils import DistributedDataParallelKwargs,InitProcessGroupKwargs
from datetime import timedelta
from transformers import CLIPTextModel, CLIPTokenizer
from torch.utils.data import Sampler
import nlpaug.augmenter.word as naw
import nlpaug.flow as naf


def to_image(img):
    """Convert a float image array into a ``PIL.Image``.

    Args:
        img: NumPy array (it must support ``astype``). When its last axis is
            not of size 3 it is treated as channel-first ``(c, h, w)`` and
            moved to ``(h, w, c)``. Values are presumably expected in
            ``[0, 1]`` -- TODO confirm against callers.

    Returns:
        The image built from the scaled ``uint8`` pixel array.
    """
    if img.shape[-1] != 3:
        img = rearrange(img, 'c h w -> h w c')
    # Clip before the cast: out-of-range floats would otherwise wrap around
    # (e.g. 256 -> 0) when converted to uint8 and show up as speckle artifacts.
    pixels = np.clip(255. * img, 0, 255)
    return Image.fromarray(pixels.astype(np.uint8))

def channel_last(img):
    """Return ``img`` with the channel axis moved to the end.

    An input whose last axis already has size 3 is returned untouched.
    Otherwise a rank-3 input is read as ``(c, h, w)`` and a rank-4 input as
    ``(f, c, h, w)``.

    Raises:
        ValueError: if the input is neither rank 3 nor rank 4.
    """
    if img.shape[-1] == 3:
        return img

    rank = len(img.shape)
    if rank == 3:
        return rearrange(img, 'c h w -> h w c')
    if rank == 4:
        return rearrange(img, 'f c h w -> f h w c')
    raise ValueError(f'img shape should be 3 or 4, but got {rank}')

def channel_first(img):
    """Return ``img`` with the channel axis in front of the spatial axes.

    A rank-3 input becomes ``(c, h, w)`` and a rank-4 input ``(f, c, h, w)``.
    The input is returned untouched when the target channel axis already has
    size 3.

    Raises:
        ValueError: if the input is neither rank 3 nor rank 4.
    """
    # rank -> (axis that should hold the channels, einops pattern to get there)
    layouts = {
        3: (0, 'h w c -> c h w'),
        4: (1, 'f h w c -> f c h w'),
    }
    rank = len(img.shape)
    if rank not in layouts:
        raise ValueError(f'img shape should be 3 or 4, but got {rank}')

    channel_axis, pattern = layouts[rank]
    if img.shape[channel_axis] == 3:
        return img
    return rearrange(img, pattern)


def normalize(img):
    """Convert ``img`` to a tensor and map its values with ``x * 2 - 1``.

    Channel-last inputs of rank 3 or 4 (last axis of size 3) are first moved
    to channel-first layout. For inputs in ``[0, 1]`` the result lies in
    ``[-1, 1]``.
    """
    if img.shape[-1] == 3:
        rank = len(img.shape)
        if rank == 3:
            img = rearrange(img, 'h w c -> c h w')
        elif rank == 4:
            img = rearrange(img, 'f h w c -> f c h w')
    as_tensor = torch.tensor(img)
    return as_tensor * 2.0 - 1.0  # to -1 ~ 1


def crop(clip, i, j, h, w):
    """
    Cut a spatial window out of a video clip.

    Args:
        clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W)
        i, j: top and left offsets of the window on the last two axes.
        h, w: height and width of the window.
    Returns:
        torch.tensor: the slice ``clip[..., i:i + h, j:j + w]``.
    """
    rank = len(clip.size())
    assert rank == 4, "clip should be a 4D tensor"
    rows = slice(i, i + h)
    cols = slice(j, j + w)
    return clip[..., rows, cols]

class RandomCropVideo(transforms.RandomCrop):
    """Random spatial crop that applies one window to a whole video clip.

    The crop window comes from the inherited ``get_params`` and is applied to
    the last two axes of the clip through ``crop``. The parent constructor is
    not invoked; only ``size`` is stored.
    """

    def __init__(self, size):
        # A single number means a square crop; anything else is kept as given.
        is_scalar = isinstance(size, numbers.Number)
        self.size = (int(size), int(size)) if is_scalar else size

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W)
        Returns:
            torch.tensor: randomly cropped video clip.
                size is (C, T, OH, OW)
        """
        top, left, height, width = self.get_params(clip, self.size)
        return crop(clip, top, left, height, width)

    def __repr__(self):
        return f'{self.__class__.__name__}(size={self.size})'

class random_crop:
    """Apply a random ``(h, w)`` video crop with probability ``p``.

    On each call a fresh uniform sample is drawn; when it is below ``p`` the
    input is cropped with ``RandomCropVideo``, otherwise it is returned as is.
    """

    def __init__(self, h, w, p):
        self.h, self.w, self.p = h, w, p

    def __call__(self, img):
        should_crop = bool(torch.rand(1) < self.p)
        if not should_crop:
            return img
        cropper = RandomCropVideo(size=(self.h, self.w))
        return cropper(img)

class TestSampler(Sampler):
    """Sampler that yields dataset indices interleaved across videos.

    The dataset indices are laid out as a ``(video_num, time)`` grid, where
    ``time`` is read from ``data_source.t`` (presumably the number of
    consecutive samples per video -- TODO confirm against the dataset class).
    Iteration walks that grid column by column, so consecutive indices come
    from different rows. With ``shuffle=True`` rows and columns are permuted
    with a ``RandomState`` seeded by ``seed``; the order is therefore the same
    on every iteration.

    Args:
        data_source: sized dataset exposing an integer attribute ``t``.
        seed: seed for the row/column permutations.
        shuffle: whether to permute rows and columns before interleaving.
        num_samples: optional cap on the number of indices yielded.

    Raises:
        ValueError: from the reshape in ``__iter__`` when ``len(data_source)``
            is not a multiple of ``data_source.t``.
    """

    def __init__(self, data_source, seed=0, shuffle=True, num_samples=None):
        self.data_source = data_source
        self.time = data_source.t
        self.video_num = len(data_source) // self.time
        self.seed = seed
        self.shuffle = shuffle
        self.num_samples = num_samples

    def __iter__(self):
        # One row per video, one column per position inside the video.
        indices = np.arange(len(self.data_source)).reshape(self.video_num, self.time)
        if self.shuffle:
            # Re-seeded on every call so each pass yields the same order.
            rng = np.random.RandomState(self.seed)
            x_idx = rng.permutation(np.arange(self.video_num))
            y_idx = rng.permutation(np.arange(self.time))
            indices = indices[x_idx, :]
            indices = indices[:, y_idx]
        # Column-major flattening interleaves the videos.
        indices = indices.transpose(1, 0).reshape(-1)
        if self.num_samples is not None:
            indices = indices[:self.num_samples]
        yield from indices

    def __len__(self):
        total = len(self.data_source)
        if self.num_samples is None:
            return total
        # Mirror the slice taken in __iter__ so the reported length never
        # exceeds the number of indices that can actually be yielded.
        return len(range(total)[:self.num_samples])

def concat_text(texts):
    """Join captions into one sentence chain, skipping repeated ones.

    Starts from the first caption and appends each later caption with the
    separator ``'. Then, '`` unless that caption already occurs as a substring
    of the text built so far.
    """
    merged = texts[0]
    for candidate in texts[1:]:
        if candidate in merged:
            continue
        merged = '. Then, '.join((merged, candidate))
    return merged

def main(  
        train_data_setting: Dict, 
        val_data_setting: Dict,
        pretrained_model_path: str='runwayml/stable-diffusion-v1-5',
        data_dir: str='./data',
        seed: int=2023,
        dataset: str='Wen',
        patch_size: int=16,
        subjects: list=['subject1'],
        gradient_accumulation_steps: int=1,
        mixed_precision: str='fp16',
        random_crop_prob: float=0.5,
        working_dir: str='.',
        cache_dir: str='./.cache',
        resume_from_checkpoint: Optional[str] = None,
        enable_xformers_memory_efficient_attention: bool = True,
        gradient_checkpointing: bool = True,
        learning_rate: float = 5.3e-5,
        adam_beta1: float = 0.9,
        adam_beta2: float = 0.999,
        adam_weight_decay: float = 1e-2,
        adam_epsilon: float = 1e-8,
        train_batch_size: int = 4,
        eval_batch_size: int = 4,
        lr_scheduler: str = "constant",
        lr_warmup_steps: int = 0,
        max_train_steps: int = 500,
        output_path: Optional[str] = None,
        max_grad_norm: float = 1.0,
        checkpointing_steps: int = 500,
        validation_steps: int = 100,
        group_name: str = 'default',
        include_test_video: bool = False,
        allow_tf32: bool = False,
        snr_gamma: float = 5.0,
        scale_lr: bool = False,
        **kwargs
):  
    """Fine-tune the 3D UNet of a text-to-video diffusion pipeline.

    Builds the train/test datasets, loads scheduler, VAE, UNet, tokenizer and
    text encoder from ``pretrained_model_path``, and trains the UNet (VAE and
    text encoder are frozen) with a Min-SNR weighted MSE loss under
    ``accelerate``. On the main process it logs the ground-truth clips to
    wandb, saves checkpoints every ``checkpointing_steps`` optimizer steps,
    generates validation videos every ``validation_steps`` optimizer steps,
    and finally saves a ``TuneAVideoPipeline`` to ``output_path``.

    Args:
        train_data_setting: config read by attribute for ``crop_ratio``,
            ``height``, ``width`` and ``resize_img_size`` (so a plain ``dict``
            will not work despite the annotation).
        val_data_setting: config read by attribute for ``eval_samples`` and
            ``video_length``; it is also unpacked as keyword arguments into
            the validation pipeline call.
        pretrained_model_path: model id or path passed to every
            ``from_pretrained`` call.
        data_dir, patch_size, subjects: forwarded to ``create_Wen_dataset``.
        seed: seed for ``set_seed``, the evaluation sampler and the
            validation generator.
        dataset: dataset name; only ``'Wen'`` is handled.
        gradient_accumulation_steps, mixed_precision: forwarded to
            ``Accelerator``.
        random_crop_prob: probability of the random crop in the training
            image transform.
        working_dir: root for the default output directory.
        cache_dir: not used in this function.
        resume_from_checkpoint: checkpoint directory name, or ``"latest"`` to
            pick the highest-numbered ``checkpoint-*`` under ``output_path``.
        enable_xformers_memory_efficient_attention, gradient_checkpointing,
            allow_tf32: optional UNet / backend switches.
        learning_rate, adam_beta1, adam_beta2, adam_weight_decay,
            adam_epsilon: AdamW hyper-parameters.
        scale_lr: multiply ``learning_rate`` by accumulation steps, batch
            size and number of processes.
        train_batch_size, eval_batch_size: DataLoader batch sizes.
        lr_scheduler, lr_warmup_steps: forwarded to ``get_scheduler``.
        max_train_steps: total number of optimizer steps.
        output_path: output directory; a timestamped directory under
            ``working_dir`` is created on the main process when ``None``.
        max_grad_norm: gradient clipping norm.
        group_name: wandb group.
        include_test_video: append the test split to the training split.
        snr_gamma: clamp value used in the Min-SNR loss weights.
        **kwargs: extra config keys; they are only recorded in the logged
            config and otherwise ignored.

    Raises:
        NotImplementedError: if ``dataset`` is not ``'Wen'``.
        ValueError: if xformers is requested but unavailable, or the noise
            scheduler reports an unknown prediction type.
    """
    # project setup
    # Snapshot of the local variables at entry, i.e. the call arguments; used
    # as the tracker config and written to config.yaml below.
    *_, config = inspect.getargvalues(inspect.currentframe())
    device = torch.device(f'cuda') if torch.cuda.is_available() else torch.device('cpu')
    set_seed(seed)

    # NOTE: this rebinds `kwargs`; the extra keyword arguments received by the
    # function are no longer reachable under that name after this line.
    kwargs = [DistributedDataParallelKwargs(find_unused_parameters=True, static_graph=True),
              InitProcessGroupKwargs(timeout=timedelta(minutes=120))]
    accelerator = Accelerator(
        gradient_accumulation_steps=gradient_accumulation_steps,
        mixed_precision=mixed_precision,
        log_with="wandb",
        kwargs_handlers=kwargs,
    )

    if accelerator.is_main_process:
        # Default output directory is timestamped; only the main process creates it.
        output_path = os.path.join(working_dir, 'results', 'video_tune', '%s'%(datetime.datetime.now().strftime("%d-%m-%Y-%H:%M:%S"))) if output_path is None else output_path
        os.makedirs(output_path, exist_ok=True)
        accelerator.init_trackers(
            "fmri-reconst-movie",
            config=config,
            init_kwargs={
            "wandb": {
                "notes": 'this tune the stable diffusion with videos',
                "group": group_name,
                "reinit": True,
                "anonymous": "allow",
                }
            },

        )
        OmegaConf.save(config, os.path.join(output_path, 'config.yaml'))
    else:
        # NOTE(review): when `output_path` is None, non-main processes end up
        # with a different (non-timestamped) path than the main process; this
        # path is used by the checkpoint-resume lookup below -- verify for
        # multi-process runs.
        output_path = os.path.join(working_dir, 'results', 'video_tune') if output_path is None else output_path
    crop_ratio = train_data_setting.crop_ratio
    h = train_data_setting.height
    w = train_data_setting.width
    resize_img_size = train_data_setting.resize_img_size

    # Number of pixels removed by the random crop along each spatial axis.
    h_crop_pix = int(crop_ratio*h)
    w_crop_pix = int(crop_ratio*w)
    eval_samples = val_data_setting.eval_samples
    # Frame rate used for the dataset and for saved/logged videos.
    fps = val_data_setting.video_length // 2
    print(f'eval_samples: {eval_samples}')
    # Train transform: rescale to [-1, 1], optional random crop, resize.
    img_transform_train = transforms.Compose([
        normalize,
        random_crop(h-h_crop_pix, w-w_crop_pix, p=random_crop_prob),
        transforms.Resize((resize_img_size, resize_img_size)), 
        channel_first
    ])
    # Test transform: same as training but without the random crop.
    img_transform_test = transforms.Compose([
        normalize, transforms.Resize((resize_img_size, resize_img_size)), 
        channel_first
    ])

    # define text transform
    text_aug = naf.Sometimes([
        naw.SynonymAug(),
        naw.RandomWordAug(action='swap'),
    ], aug_p=0.5)

    if dataset == 'Wen':
        dataset_train, dataset_test = create_Wen_dataset(data_dir, patch_size, 
                fmri_transform=torch.FloatTensor, image_transform=[img_transform_train, img_transform_test], 
                subjects=subjects, fps=fps)
        num_voxels = dataset_train.num_voxels
        if include_test_video:
            # Append the test split to the training arrays in place.
            dataset_train.fmri = np.concatenate([dataset_train.fmri, dataset_test.fmri], axis=0)
            dataset_train.video = np.concatenate([dataset_train.video, dataset_test.video], axis=0)
            dataset_train.text = np.concatenate([dataset_train.text, dataset_test.text], axis=0)
            print(f'Adding {len(dataset_test)} test samples to training set.')
    else:
        raise NotImplementedError(f'{dataset} not implemented')


    # Load scheduler, tokenizer and models.
    noise_scheduler = DDPMScheduler.from_pretrained(pretrained_model_path, subfolder="scheduler")
    vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae")
    unet = UNet3DConditionModel.from_pretrained_2d(pretrained_model_path, subfolder="unet")
    
    tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_path, subfolder="tokenizer")
    text_encoder = CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder")

    # Only the UNet is trained; the VAE and text encoder stay frozen.
    vae.requires_grad_(False)
    # unet.requires_grad_(False)
    text_encoder.requires_grad_(False)

    if enable_xformers_memory_efficient_attention:
        if is_xformers_available():
            unet.enable_xformers_memory_efficient_attention()
        else:
            raise ValueError("xformers is not available. Make sure it is installed correctly")

    if gradient_checkpointing:
        unet.enable_gradient_checkpointing()

    if allow_tf32:
        torch.backends.cuda.matmul.allow_tf32 = True
    if scale_lr:
        learning_rate = (
                learning_rate * gradient_accumulation_steps * train_batch_size * accelerator.num_processes
            )
    
    optimizer = torch.optim.AdamW(
        unet.parameters(),
        lr=learning_rate,
        betas=(adam_beta1, adam_beta2),
        weight_decay=adam_weight_decay,
        eps=adam_epsilon,
    )

    if accelerator.is_main_process:
        wandb.watch(unet, log="all", log_freq=50)
    print('watch model')
    # Get the validation pipeline
    # It is built around the same `unet` object that is being trained.
    validation_pipeline = TuneAVideoPipeline(
        vae=vae, unet=unet, tokenizer=tokenizer, text_encoder=text_encoder,
        scheduler=DDIMScheduler.from_pretrained(pretrained_model_path, subfolder="scheduler")
    )
    validation_pipeline.enable_vae_slicing()
    print('validation pipeline created')
    # DataLoaders creation:
    train_dataloader = torch.utils.data.DataLoader(
        dataset_train, batch_size=train_batch_size, shuffle=True
    )
    # The evaluation loader uses a fixed-seed sampler capped at `eval_samples`.
    test_sampler = TestSampler(dataset_test, seed=seed, shuffle=True, num_samples=eval_samples)
    eval_dataloader = torch.utils.data.DataLoader(
        dataset_test, batch_size=eval_batch_size, sampler=test_sampler
    )
    print('dataloader created')
    # Scheduler
    # NOTE: from here on `lr_scheduler` is the scheduler object, not its name.
    lr_scheduler = get_scheduler(
        lr_scheduler,
        optimizer=optimizer,
        num_warmup_steps=lr_warmup_steps * gradient_accumulation_steps,
        num_training_steps=max_train_steps * gradient_accumulation_steps,
    )

    # Prepare everything with our `accelerator`.
    # The evaluation dataloader is not prepared; it is only iterated on the main process.
    unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
        unet, optimizer, train_dataloader, lr_scheduler
    )
    print('accelerator prepared')
    # For mixed precision training we cast the text_encoder and vae weights to half-precision
    # as these models are only used for inference, keeping weights in full precision is not required.
    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
        weight_dtype = torch.float16
    elif accelerator.mixed_precision == "bf16":
        weight_dtype = torch.bfloat16

    text_encoder.to(accelerator.device, dtype=weight_dtype)
    vae.to(accelerator.device, dtype=weight_dtype)
    print('model moved')

    def compute_snr(timesteps):
        """
        Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
        """
        alphas_cumprod = noise_scheduler.alphas_cumprod
        sqrt_alphas_cumprod = alphas_cumprod**0.5
        sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5

        # Expand the tensors.
        # Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026
        sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
        while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
            sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
        alpha = sqrt_alphas_cumprod.expand(timesteps.shape)

        sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
        while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
            sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None]
        sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)

        # Compute SNR.
        snr = (alpha / sigma) ** 2
        return snr
    
    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
    # Afterwards we recalculate our number of training epochs
    num_train_epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)
    
    # save groundtruth
    # Each evaluation batch is written as a GIF and collected into one wandb table.
    if accelerator.is_main_process: 
        table = []
        for idx, prompt in enumerate(eval_dataloader):
            video = prompt['image']
            # Only the captions of the first sample in the batch are used as the table text.
            text = [concat_text([t[0] for t in prompt["text"]])] 
            out = save_videos_grid(rearrange(video, 'b t c h w -> b c t h w'), f"{output_path}/samples/sample-gt/test{idx+1}.gif", 
                                rescale=True, fps=fps)
            out = rearrange(np.stack(out), 't h w c -> t c h w') / 255.
            # Downscale to 128x128 before logging.
            out = F.interpolate(torch.from_numpy(out), size=(128, 128), mode='bilinear', align_corners=False)
            # accelerator.log({
            #     f"test{idx+1}": wandb.Video((out* 255).numpy().astype(np.uint8), fps=3, format="gif") 
            # })
            table.append(
                [f"Groundtruth {idx+1}", text[0], wandb.Video((out * 255).numpy().astype(np.uint8), fps=fps, format="gif")],
            )
        accelerator.log({"Groundtruth": wandb.Table(data=table, columns=["ID", "Text", "Video"])})
        
    print('groundtruth saved')
    # Train!
    total_batch_size = train_batch_size * accelerator.num_processes * gradient_accumulation_steps
    print("***** Running training *****")
    print(f"  Num examples = {len(dataset_train)}")
    print(f"  Num Epochs = {num_train_epochs}")
    print(f"  Instantaneous batch size per device = {train_batch_size}")
    print(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    print(f"  Gradient Accumulation steps = {gradient_accumulation_steps}")
    print(f"  Total optimization steps = {max_train_steps}")
    global_step = 0
    first_epoch = 0    
    
    # Potentially load in the weights and states from a previous save
    if resume_from_checkpoint:
        if resume_from_checkpoint != "latest":
            # Only the final path component is kept; it is looked up under `output_path`.
            path = os.path.basename(resume_from_checkpoint)
        else:
            # Get the most recent checkpoint
            dirs = os.listdir(output_path)
            dirs = [d for d in dirs if d.startswith("checkpoint")]
            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
            path = dirs[-1]
        accelerator.print(f"Resuming from checkpoint {path}")
        accelerator.load_state(os.path.join(output_path, path))
        # The step count is parsed from the directory name "checkpoint-<step>".
        global_step = int(path.split("-")[1])
        first_epoch = global_step // num_update_steps_per_epoch
        # NOTE(review): `resume_step` is expressed in optimizer steps but is
        # compared against the dataloader batch index in the loop below; the
        # two differ when gradient_accumulation_steps > 1 -- verify.
        resume_step = global_step % num_update_steps_per_epoch

    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(global_step, int(max_train_steps)), disable=not accelerator.is_local_main_process)
    progress_bar.set_description("Steps")

    for epoch in range(first_epoch, num_train_epochs):
        unet.train()
        # `train_loss` accumulates over one optimizer step; the list collects
        # one value per optimizer step for the per-epoch mean.
        train_loss = 0.0
        train_loss_list = []
        for step, batch in enumerate(train_dataloader):
            # Skip steps until we reach the resumed step
            if resume_from_checkpoint and epoch == first_epoch and step < resume_step:
                if step % gradient_accumulation_steps == 0:
                    progress_bar.update(1)
                continue

            with accelerator.accumulate(unet):
                # Convert videos to latent space
                pixel_values = batch["image"].to(weight_dtype)
                video_length = pixel_values.shape[1]
                # Frames are folded into the batch axis so the VAE encodes them as images.
                pixel_values = rearrange(pixel_values, "b f c h w -> (b f) c h w")
                latents = vae.encode(pixel_values).latent_dist.sample()
                latents = rearrange(latents, "(b f) c h w -> b c f h w", f=video_length)
                # presumably the Stable Diffusion VAE latent scaling factor -- TODO confirm
                latents = latents * 0.18215

                # Sample noise that we'll add to the latents
                noise = torch.randn_like(latents)
                bsz = latents.shape[0]
                # Sample a random timestep for each video
                timesteps = torch.randint(0, noise_scheduler.num_train_timesteps, (bsz,), device=latents.device)
                timesteps = timesteps.long()

                # Add noise to the latents according to the noise magnitude at each timestep
                # (this is the forward diffusion process)
                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
                # One merged caption per sample: batch["text"] is indexed as [caption][sample].
                text = [concat_text([t[b] for t in batch["text"]]) for b in range(bsz)] 

                text = text_aug.augment(text, n=len(text))
                prompt_ids = tokenizer(
                    text, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
                ).input_ids
                # NOTE(review): uses `device` from the top of the function rather than
                # `accelerator.device`, which the text encoder was moved to -- verify
                # they agree in multi-GPU runs.
                encoder_hidden_states = text_encoder(prompt_ids.to(device)).last_hidden_state

                # Get the target for loss depending on the prediction type
                if noise_scheduler.prediction_type == "epsilon":
                    target = noise
                elif noise_scheduler.prediction_type == "v_prediction":
                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
                else:
                    raise ValueError(f"Unknown prediction type {noise_scheduler.prediction_type}")

                # print(encoder_hidden_states.shape)
                # Predict the noise residual and compute loss
                model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
                # loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
                # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
                # Since we predict the noise instead of x_0, the original formulation is slightly changed.
                # This is discussed in Section 4.2 of the same paper.
                snr = compute_snr(timesteps)
                # weight = min(snr, snr_gamma) / snr, computed per sample
                mse_loss_weights = (
                    torch.stack([snr, snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0] / snr
                )
                # We first calculate the original loss. Then we mean over the non-batch dimensions and
                # rebalance the sample-wise losses with their respective loss weights.
                # Finally, we take the mean of the rebalanced loss.
                loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
                loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
                loss = loss.mean()

                # Gather the losses across all processes for logging (if we use distributed training).
                avg_loss = accelerator.gather(loss.repeat(train_batch_size)).mean()
                train_loss += avg_loss.item() / gradient_accumulation_steps

                # Backpropagate
                accelerator.backward(loss)
                if accelerator.sync_gradients:
                    accelerator.clip_grad_norm_(unet.parameters(), max_grad_norm)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            # Checks if the accelerator has performed an optimization step behind the scenes
            if accelerator.sync_gradients:
                progress_bar.update(1)
                global_step += 1
                train_loss_list.append(train_loss)
                # accelerator.log({"train_loss": train_loss}, step=global_step)
                train_loss = 0.0

                if global_step % checkpointing_steps == 0:
                    if accelerator.is_main_process:
                        save_path = os.path.join(output_path, f"checkpoint-{global_step}")
                        accelerator.save_state(save_path)
                        print(f"Saved state to {save_path}")

                if global_step % validation_steps == 0:
                    if accelerator.is_main_process:
                        samples = []
                        # Re-seeded at every validation round.
                        generator = torch.Generator(device=latents.device)
                        generator.manual_seed(seed)

                        for idx, prompt in enumerate(eval_dataloader):
                            bsz = prompt["image"].shape[0]
                            text = [concat_text([t[b] for t in prompt["text"]]) for b in range(bsz)] 
                            # Every key of `val_data_setting` (including `eval_samples`)
                            # is forwarded to the pipeline call.
                            sample = validation_pipeline(prompt=text, generator=generator,
                                                         **val_data_setting).videos
                            out = save_videos_grid(sample, f"{output_path}/samples/sample-{global_step}/test{idx+1}.gif", fps=fps)
                            samples.append(sample)
                            out = rearrange(np.stack(out), 't h w c -> t c h w') / 255.
                            out = F.interpolate(torch.from_numpy(out), size=(128, 128), mode='bilinear', align_corners=False)
                            accelerator.log({
                                f"test{idx+1}": wandb.Video((out* 255).numpy().astype(np.uint8), fps=fps, format="gif") 
                            })
                        # All validation batches are also saved together as one grid.
                        samples = torch.concat(samples)
                        save_path = f"{output_path}/samples/sample-{global_step}.gif"
                        save_videos_grid(samples, save_path)
                        print(f"Saved samples to {save_path}")

            logs = {'epoch': epoch, "step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
            progress_bar.set_postfix(**logs)

            if global_step >= max_train_steps:
                break
        # Per-epoch logging. NOTE: `train_loss_list` is empty (mean is nan) if
        # no optimizer step happened during this epoch.
        accelerator.log({"train_loss": np.mean(train_loss_list)})
        accelerator.log({"step": global_step})
        accelerator.log({"epoch": epoch})
    
    # Create the pipeline using the trained modules and save it.
    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        unet = accelerator.unwrap_model(unet)
        pipeline = TuneAVideoPipeline.from_pretrained(
            pretrained_model_path,
            vae=vae,
            unet=unet,
        )
        pipeline.save_pretrained(output_path)

    accelerator.end_training()

    
def get_args_parser():
    """Build the command-line parser for this script.

    The only option is ``--config``, the path to the YAML config file
    (default ``configs/video_tune.yaml``).
    """
    arg_parser = argparse.ArgumentParser(prog='Decoding fMRI to reconstruct videos')
    # project parameters
    arg_parser.add_argument(
        '--config',
        type=str,
        default='configs/video_tune.yaml',
        help='path to config file',
    )
    return arg_parser

if __name__ == '__main__':
    # Script entry point: load the YAML config named on the command line,
    # record where it came from, and pass its keys to main() as keyword arguments.
    cli_args = get_args_parser().parse_args()
    config = OmegaConf.load(cli_args.config)
    config.config_path = cli_args.config

    main(**config)
    
        




    