import numpy as np
from numpy import *
import matplotlib.pyplot as plt
from tqdm import tqdm
import matplotlib
import datetime
import torch
import math
from ML_ETC import *

class DecentralizeOPTN2(object):

    def __init__(self, horizon, trial, num_player, num_arm, player_ranking, arm_ranking, player_mean):
        self.path = './ResultsData/decen/'

        self.horizon = horizon
        self.trials = trial

        self.p_lambda = 0.08
        self.epsilon = 10**(-10)

        # phased ETC algorithm
        self.varEpsilon = 0.2

        

        self.num_players = num_player
        self.num_arms = num_arm
        self.players_ranking = player_ranking
        self.arms_rankings = arm_ranking  
        self.players_mean = player_mean

        # UCB-D4
        self.beta = 1/(2*self.num_arms)
        self.gamma = 2
        print("player_ranking", self.players_ranking)
        print("arm_ranking", self.arms_rankings)
        print("player preference", self.players_mean)

        
        # self.pessimal_matching = self.get_pessimal_matching(self.players_ranking,self.arms_rankings).tolist()
        # print("pessimal matching",self.pessimal_matching)

        self.opt_matching = self.get_opt_matching(self.players_ranking,self.arms_rankings) #[1,2] p_0-1, p_1-2
           
        
        print("optimal matching",self.opt_matching)
        # print("players_mean", self.players_mean)
        
        print("Stable?", self.isUnstablePlayer(self.opt_matching))
        # At=np.zeros(self.num_players)
        # for a,p in enumerate(self.pessimal_matching):
        #         At[p]=a
        # print("Pessimal Matching Player:",At)

    def get_pessimal_matching(self, players_rankings, arms_rankings):
        """Run arm-proposing deferred acceptance and return it indexed by player.

        In every sweep each unmatched arm proposes to the next player on its
        own list; each player then keeps the proposing arm it ranks highest
        and rejects the others, which move one step down their lists.

        Args:
            players_rankings: for each player, its preference order over arm
                indices (most preferred first).
            arms_rankings: for each arm, its preference order over player
                indices (most preferred first).

        Returns:
            numpy.ndarray: when every player ends up holding exactly one arm,
            ``np.squeeze`` of the per-player holder lists (a 1-D integer array
            with entry ``p`` equal to the arm held by player ``p``).  When
            some player holds no arm, a 1-D object array whose entries are the
            lists ``[arm]`` or ``[]``.

        Raises:
            IndexError: if an arm is rejected by every player on its list
                (for instance when there are more arms than players).
        """
        # next_choice[a] is the position in arm a's list of its next proposal.
        next_choice = np.zeros(self.num_arms, int)
        # matched[a] is True while arm a is tentatively held by a player.
        matched = np.zeros(self.num_arms, bool)
        # matching[p] lists the arms currently proposing to / held by player p.
        matching = [[] for _ in range(self.num_players)]

        # Keep sweeping until every arm is held by some player.
        while np.sum(matched) != self.num_arms:

            # All unmatched arms propose simultaneously.
            for a_idx in range(self.num_arms):
                if not matched[a_idx]:
                    target = arms_rankings[a_idx][next_choice[a_idx]]
                    matching[target].append(a_idx)

            # Every player with at least one candidate keeps its favourite.
            for p_idx in range(self.num_players):
                candidates = matching[p_idx]
                if len(candidates) != 0:
                    kept = next(
                        (x for x in players_rankings[p_idx] if x in candidates), None)
                    matching[p_idx] = [kept]
                    for a_idx in candidates:
                        accepted = (a_idx == kept)
                        matched[a_idx] = accepted
                        if not accepted:
                            # A rejected arm moves on to its next preference.
                            next_choice[a_idx] += 1

        # np.squeeze cannot build an array from lists of unequal length on
        # NumPy >= 1.24 (it raises ValueError); older releases silently
        # produced an object array of lists.  Build that array explicitly so
        # the result is the same on every NumPy version.
        if len({len(held) for held in matching}) > 1:
            packed = np.empty(len(matching), dtype=object)
            for p_idx, held in enumerate(matching):
                packed[p_idx] = held
            return packed
        return np.squeeze(matching)
    def get_opt_matching(self,players_rankings,arms_rankings):
        """Return the player-proposing Gale-Shapley matching indexed by player.

        Args:
            players_rankings: for each player, its preference order over arm
                indices; forwarded to ``Gale_Shapley``.
            arms_rankings: accepted for interface symmetry but not used here;
                ``Gale_Shapley`` reads ``self.arms_rankings`` instead.

        Returns:
            list: entry ``p`` is the arm assigned to player ``p``, or ``-1``
            when no arm holds that player.

        Side effects:
            Prints the raw ``Gale_Shapley`` result and every (arm, holder)
            pair while converting it.
        """
        arm_matching = self.Gale_Shapley(players_rankings)
        print(arm_matching)

        # With a single arm and a single player the squeezed result is a 0-d
        # array, which cannot be iterated; view it as one entry.
        if isinstance(arm_matching, np.ndarray) and arm_matching.ndim == 0:
            arm_matching = arm_matching.reshape(1)

        matching = [-1] * self.num_players
        for a_idx, holder in enumerate(arm_matching):
            print(a_idx, holder)
            # Gale_Shapley yields bare integers when every arm holds exactly
            # one player and lists ([player] or []) otherwise.  Calling len()
            # on a bare integer raises TypeError, so normalise both shapes.
            holder = np.atleast_1d(holder)
            if holder.size != 0:
                matching[int(holder[0])] = a_idx

        return matching

    def get_matching_from_GS(self, GSreturn):
        """Convert an arm-indexed Gale-Shapley result into a player-indexed list.

        Args:
            GSreturn: sequence indexed by arm.  Each entry is either the index
                of the player held by that arm (a bare integer, as produced
                when every arm holds exactly one player) or a list that is
                ``[player]`` or empty.

        Returns:
            list: entry ``p`` is the arm assigned to player ``p``, or ``-1``
            when no arm holds that player.
        """
        # A 0-d array (single arm, single player) cannot be iterated; view it
        # as a sequence with one entry.
        if isinstance(GSreturn, np.ndarray) and GSreturn.ndim == 0:
            GSreturn = GSreturn.reshape(1)

        matching = [-1] * self.num_players
        for a_idx, holder in enumerate(GSreturn):
            # len() on a bare integer entry raises TypeError, so normalise
            # scalar and list entries to a 1-D array before inspecting them.
            holder = np.atleast_1d(holder)
            if holder.size != 0:
                matching[int(holder[0])] = a_idx

        return matching

    # def get_pessimal_matching(self,players_rankings,arms_rankings):
    #     # propose_order records the order arms should follow while proposing
    #     init_propose_order = np.zeros(self.num_arms, int)
    #     propose_order = init_propose_order
    #     # matched record whether a specific player is matched or not
    #     matched = np.zeros(self.num_arms, bool)
    #     # matching records the choice of a player for a specific arm
    #     matching = [[] for _ in range(self.num_players)]

    #     # Terminates if all matched
    #     while np.sum(matched) != self.num_arms:

    #         # arms propose at the same time
    #         for a_idx in range(self.num_arms):
    #             if not matched[a_idx]:
    #                 # p_proposal is the index of an arm
    #                 # propose_order is the vector, p_o[i] is the order of player i's next proposal
    #                 print("propose_order[a_idx]",propose_order[a_idx])
    #                 a_proposal = arms_rankings[a_idx][propose_order[a_idx]]
    #                 matching[a_proposal].append(a_idx)

    #         # arms choose its player
    #         for p_idx in range(self.num_players):
    #             p_choices = matching[p_idx]

    #             if len(p_choices) != 0:    
    #                 # each arm chooses the its most preferable one
    #                 p_choice = next((x for x in players_rankings[p_idx] if x in matching[p_idx]), None)
    #                 # update arm's choice where there should only be one left
    #                 matching[p_idx] = [p_choice]
    #                 # update player's state of matched
    #                 for a_idx in p_choices:
    #                     matched[a_idx] = (a_idx == p_choice)
    #                     propose_order[a_idx] += (1 - (a_idx == p_choice))
    #     return np.squeeze(matching)

    # def isUnstable(self, arm_matching):
    #     # arm_matching: [0,1,-1]
    #     # arm 0 matches player 0; arm 1 matches player 1; arm 2 matches nothing

    #     # if unstable return 1, otherwise return 0
    #     arm_matching = arm_matching.tolist()

    #     if -1 in arm_matching:
    #         return 1

    #     player_matching = np.ones(self.num_players)*(-1)
    #     for p_idx in range(self.num_players):
    #         if p_idx in arm_matching:
    #             player_matching[p_idx] = arm_matching.index(p_idx)

    #     if -1 in player_matching:
    #         return 1
        
    #     # find blocking pair
    #     for p_idx in range(self.num_players):
    #         for possible_arm_rank in range(self.players_ranking[p_idx].index(player_matching[p_idx])):
    #             arm = self.players_ranking[p_idx][possible_arm_rank]
    #             for possible_player_rank in range(self.arms_rankings[arm].index(arm_matching[arm])):
    #                 if self.arms_rankings[arm][possible_player_rank] == p_idx:
    #                     return 1
    #     return 0

    def isUnstable(self, arm_matching):
        """Report whether an arm-indexed matching is unstable.

        Args:
            arm_matching: array indexed by arm whose entry is the player held
                by that arm, e.g. ``[0, 1, -1]`` means arm 0 holds player 0,
                arm 1 holds player 1 and arm 2 holds nobody.  Must provide
                ``tolist()``.

        Returns:
            int: 1 when some player is held by no arm or a blocking pair
            exists among the held arms, otherwise 0.
        """
        holders = arm_matching.tolist()

        # Invert the matching: arm held by each player, -1 if the player is
        # not held by any arm (first occurrence wins).
        player_matching = [
            holders.index(p_idx) if p_idx in holders else -1
            for p_idx in range(self.num_players)
        ]
        if -1 in player_matching:
            return 1

        # Blocking pair: a player strictly prefers another player's arm and
        # that arm strictly prefers this player to its current holder.
        for p_idx, own_arm in enumerate(player_matching):
            prefs = self.players_ranking[p_idx]
            for rival, rival_arm in enumerate(player_matching):
                if prefs.index(rival_arm) >= prefs.index(own_arm):
                    continue
                arm_prefs = self.arms_rankings[rival_arm]
                if arm_prefs.index(p_idx) < arm_prefs.index(rival):
                    return 1

        return 0

    def isUnOptimalStable(self, player_matching):
        # OptStable= True
        for p_idx in range(self.num_players):
            if player_matching[p_idx] != self.opt_matching[p_idx]:
                # OptStable=False
                return 1
        return 0

    def isUnstablePlayer(self, player_matching):
        """Report whether a player-indexed matching is unstable.

        Args:
            player_matching: sequence indexed by player giving the arm each
                player holds, with ``-1`` marking an unmatched player.

        Returns:
            int: 1 when some player is unmatched or a blocking pair exists
            among the held arms, otherwise 0.
        """
        # Any unmatched player makes the matching unstable outright.
        if -1 in player_matching:
            return 1

        player_count = self.num_players
        for p_idx in range(player_count):
            own_arm = player_matching[p_idx]
            prefs = self.players_ranking[p_idx]
            for rival in range(player_count):
                rival_arm = player_matching[rival]
                # Skip arms this player does not strictly prefer to its own.
                if prefs.index(rival_arm) >= prefs.index(own_arm):
                    continue
                # Blocking pair if that arm also strictly prefers this player
                # to the player it currently holds.
                arm_prefs = self.arms_rankings[rival_arm]
                if arm_prefs.index(p_idx) < arm_prefs.index(rival):
                    return 1

        return 0


    
    
    def Gale_Shapley(self, player_ranking):
        """Run player-proposing deferred acceptance against ``self.arms_rankings``.

        In every sweep each unmatched player proposes to the next arm on its
        own list; each arm then keeps the proposer it ranks highest and
        rejects the others, which move one step down their lists.

        Args:
            player_ranking: for each player, its preference order over arm
                indices (most preferred first).

        Returns:
            numpy.ndarray: when every arm ends up holding exactly one player,
            ``np.squeeze`` of the per-arm holder lists (a 1-D integer array
            with entry ``a`` equal to the player held by arm ``a``).  When
            some arm holds nobody, a 1-D object array whose entries are the
            lists ``[player]`` or ``[]``.

        Raises:
            IndexError: if a player is rejected by every arm on its list
                (for instance when there are more players than arms).
        """
        # next_choice[p] is the position in player p's list of its next proposal.
        next_choice = np.zeros(self.num_players, int)
        # matched[p] is True while player p is tentatively held by an arm.
        matched = np.zeros(self.num_players, bool)
        # matching[a] lists the players currently proposing to / held by arm a.
        matching = [[] for _ in range(self.num_arms)]

        # Keep sweeping until every player is held by some arm.
        while np.sum(matched) != self.num_players:

            # All unmatched players propose simultaneously.
            for p_idx in range(self.num_players):
                if not matched[p_idx]:
                    target = player_ranking[p_idx][next_choice[p_idx]]
                    matching[target].append(p_idx)

            # Every arm with at least one candidate keeps its favourite.
            for a_idx in range(self.num_arms):
                candidates = matching[a_idx]
                if len(candidates) != 0:
                    kept = next(
                        (x for x in self.arms_rankings[a_idx] if x in candidates), None)
                    matching[a_idx] = [kept]
                    for p_idx in candidates:
                        accepted = (p_idx == kept)
                        matched[p_idx] = accepted
                        if not accepted:
                            # A rejected player moves on to its next preference.
                            next_choice[p_idx] += 1

        # np.squeeze cannot build an array from lists of unequal length on
        # NumPy >= 1.24 (it raises ValueError); older releases silently
        # produced an object array of lists.  Build that array explicitly so
        # markets with unmatched arms work on every NumPy version.
        if len({len(held) for held in matching}) > 1:
            packed = np.empty(len(matching), dtype=object)
            for a_idx, held in enumerate(matching):
                packed[a_idx] = held
            return packed
        return np.squeeze(matching)
    




    def run_phasedETC(self,Beta):
        """Simulate the decentralized phased explore-then-commit algorithm.

        Each trial has three stages:

        1. Index estimation (the first ``num_players`` rounds): every player
           starts on arm 0; the player arm 0 accepts in a round records that
           round as its index and moves to arm 1.
        2. Exploration (the leading rounds of every phase ``i =
           floor(log2(round))``): players sweep the arms round-robin, offset
           by their index, and update running reward means.
        3. Commit (the remaining rounds of the phase): arms are ranked by
           estimated mean and ``Gale_Shapley`` is run on those rankings.

        Args:
            Beta: only used to build the names of the output files.

        Side effects:
            Prints the per-round flag vector and its sum for every trial, and
            writes three ``.npz`` files (regret, reward, unstable) under
            ``./ResultsData/``.
        """

        # Result buffers: [player, trial, round] for regret/reward and
        # [trial, round] for the instability flag.
        regrets_trials = np.zeros([self.num_players, self.trials, self.horizon])
        rewards_trials = np.zeros([self.num_players, self.trials, self.horizon])
        unstable_trials = np.zeros([self.trials, self.horizon])

        for trial in tqdm(range(self.trials), ascii=True, desc="Running the decentralized phasedETC"):
            # Every round is flagged unstable (1) unless a commit round
            # overwrites it below.
            unstable_one_trial = np.ones(self.horizon)
            regrets_one_trial = np.zeros([self.num_players, self.horizon])
            rewards_one_trial = np.zeros([self.num_players, self.horizon])

            # Running mean reward and pull count of every (player, arm) pair.
            players_es_mean = [np.zeros(self.num_arms) for j in range(self.num_players)]
            players_count = [np.zeros(self.num_arms) for j in range(self.num_players)]


            # Index estimation: every index starts at num_players - 1 and is
            # overwritten by the round in which the player wins arm 0.
            indexs = np.ones(self.num_players)*self.num_players-1
            # Arm each player pulls during index estimation (all start on arm 0).
            arms = np.zeros(self.num_players)

            # At[p] is the arm player p pulls this round; last_pulled[a] is
            # the player arm a accepts this round (-1 if nobody pulled it).
            At = np.ones(self.num_players)*(-1)
            last_pulled = np.ones(self.num_arms)*(-1)

            for round in range(self.num_players):
                for p_idx in range(self.num_players):
                    At[p_idx] = arms[p_idx]

                At = At.astype(int)
                # Conflict resolution: each pulled arm accepts the
                # highest-ranked player (by its own ranking) among its pullers.
                last_pulled = np.ones(self.num_arms)*(-1)
                for a_idx in range(self.num_arms):
                    if a_idx in At:
                        for p_rank in range(self.num_players):
                            if At[self.arms_rankings[a_idx][p_rank]]==a_idx:
                                last_pulled[a_idx] = self.arms_rankings[a_idx][p_rank]
                                break
                last_pulled = last_pulled.astype(int)
               
                for p_idx in range(self.num_players):
                    if last_pulled[At[p_idx]]==p_idx:
                        # Accepted: regret is the gap to the reference arm's
                        # true mean, clipped at zero.
                        regrets_one_trial[p_idx][round]=max(0,self.players_mean[p_idx][self.opt_matching[p_idx]] - self.players_mean[p_idx][At[p_idx]])
                        rewards_one_trial[p_idx][round] = self.players_mean[p_idx][At[p_idx]]
                        if At[p_idx]==0:
                            # The winner of arm 0 fixes its index and moves to
                            # arm 1 for the rest of this stage.
                            # NOTE(review): arm index 1 presumes at least two arms — confirm.
                            indexs[p_idx]=round
                            arms[p_idx] = 1
                    else:
                        # Rejected: no reward, full reference mean as regret.
                        regrets_one_trial[p_idx][round] = self.players_mean[p_idx][self.opt_matching[p_idx]]
                        rewards_one_trial[p_idx][round] = 0
                
            current_player_ranking = [ np.zeros(self.num_arms) for j in range(self.num_players)]
            current_match = np.zeros(self.num_arms)

            for round in range(self.num_players,self.horizon):
                
                # Phase number: phase i starts at round 2**i.
                i = math.floor(math.log(round,2))
                # exploration: the first num_arms * floor(i**varEpsilon)
                # rounds of the phase
                if round-2**i+1 <= self.num_arms*math.floor(i**self.varEpsilon):
                    
                    # Round-robin over the arms, shifted by each player's index.
                    for p_idx in range(self.num_players):
                        At[p_idx] = (round+2+indexs[p_idx]-2**i)%self.num_arms
                    # Same conflict resolution as in the index-estimation stage.
                    last_pulled = np.ones(self.num_arms)*(-1)
                    for a_idx in range(self.num_arms):
                        if a_idx in At:
                            for p_rank in range(self.num_players):
                                if At[self.arms_rankings[a_idx][p_rank]]==a_idx:
                                    last_pulled[a_idx] = self.arms_rankings[a_idx][p_rank]
                                    break
                    last_pulled = last_pulled.astype(int)
                    # Exploration rounds are always flagged unstable.
                    unstable_one_trial[round] = 1
                    
                    At = At.astype(int)
                    for p_idx in range(self.num_players):
                        if last_pulled[At[p_idx]]==p_idx:
                            # update: draw a unit-variance Gaussian reward and
                            # fold it into the running mean
                            # reward = np.random.binomial(1, self.players_mean[p_idx][At[p_idx]])
                            reward = np.random.normal(
                                loc=self.players_mean[p_idx][At[p_idx]], scale=1.0, size=None)
                            players_count[p_idx][At[p_idx]]+=1
                            players_es_mean[p_idx][At[p_idx]]+= (reward-players_es_mean[p_idx][At[p_idx]]) / players_count[p_idx][At[p_idx]]
                            
                            # record: regret and reward use the true means,
                            # not the sampled reward
                            regrets_one_trial[p_idx][round]=max(0,self.players_mean[p_idx][self.opt_matching[p_idx]] - self.players_mean[p_idx][At[p_idx]])
                            rewards_one_trial[p_idx][round] = self.players_mean[p_idx][At[p_idx]]
                        else:
                            regrets_one_trial[p_idx][round] = self.players_mean[p_idx][self.opt_matching[p_idx]]
                            rewards_one_trial[p_idx][round] = 0
                # commit
                else:
                    # Rank arms by estimated mean, best first.
                    for j in range(self.num_players):
                        current_player_ranking[j] = np.argsort(-players_es_mean[j])
                
                    current_match = self.Gale_Shapley(current_player_ranking)
                    
                    # Convert to a player-indexed matching and flag the round
                    # when it differs from the reference matching.
                    matching = self.get_matching_from_GS(current_match)
                    unstable_one_trial[round] = self.isUnOptimalStable(matching)

                    
                    for p_idx in range(self.num_players):
                        if matching[p_idx]!=-1:
                            regrets_one_trial[p_idx][round]=max(0,self.players_mean[p_idx][self.opt_matching[p_idx]] - self.players_mean[p_idx][matching[p_idx]])
                            rewards_one_trial[p_idx][round] = self.players_mean[p_idx][matching[p_idx]]
                        else:
                            # Unmatched player: no reward, full reference mean
                            # as regret.
                            regrets_one_trial[p_idx][round]= self.players_mean[p_idx][self.opt_matching[p_idx]]
                            rewards_one_trial[p_idx][round] = 0

                    
            # Copy this trial's per-round series into the result buffers.
            for i in range(self.num_players):
                regrets_trials[i][trial] = regrets_one_trial[i]
                rewards_trials[i][trial] = rewards_one_trial[i]

            unstable_trials[trial] = unstable_one_trial

            print(unstable_one_trial)
            print(sum(unstable_one_trial))

       
        # Persist the raw per-round series; Beta only appears in the file names.
        np.savez('./ResultsData/DecenOpt_PhasedETC_Beta_'+str(Beta)+'N_'+str(self.num_players)+'_Regret.npz', regret=regrets_trials)
        np.savez('./ResultsData/DecenOpt_PhasedETC_Beta_'+str(Beta)+'N_'+str(self.num_players)+'_Reward.npz', reward=rewards_trials)
        np.savez('./ResultsData/DecenOpt_PhasedETC_Beta_'+str(Beta)+'N_'+str(self.num_players)+'_Unstable.npz', unstable=unstable_trials)
        


     


    def run_ETGS(self, Beta):
        """Simulate the explore-then-Gale-Shapley (ETGS) algorithm.

        Each trial explores in phases of growing length.  After every phase
        each player builds confidence bounds for its arms; once all players
        pass the separation test (or the horizon is reached) exploration
        stops and the remaining rounds use the ``Gale_Shapley`` matching
        computed from the players' estimated rankings.

        Args:
            Beta: only used to build the names of the output files.

        Side effects:
            Prints the per-round flag vector and its sum for every trial,
            writes three ``.npz`` files (regret, reward, unstable) under
            ``./ResultsData/``, then prints the flag matrix and the final
            cumulative flag count of every trial.
        """
        # Result buffers: [player, trial, round] for regret/reward and
        # [trial, round] for the instability flag.
        regrets_trials = np.zeros(
            [self.num_players, self.trials, self.horizon])
        rewards_trials = np.zeros(
            [self.num_players, self.trials, self.horizon])
        unstable_trials = np.zeros([self.trials, self.horizon])
        for trial in tqdm(range(self.trials), ascii=True, desc="Running the ETGS"):
            # Every round is flagged unstable (1) unless a commit round
            # overwrites it below.
            unstable_one_trial = np.ones(self.horizon)
            regrets_one_trial = np.zeros([self.num_players, self.horizon])
            rewards_one_trial = np.zeros([self.num_players, self.horizon])
            # Running mean, confidence bounds and pull count per (player, arm).
            players_es_mean = [np.zeros(self.num_arms)
                               for j in range(self.num_players)]
            ucb = [np.zeros(self.num_arms) for j in range(self.num_players)]
            lcb = [np.zeros(self.num_arms) for j in range(self.num_players)]
            players_count = [np.zeros(self.num_arms)
                             for j in range(self.num_players)]
            
            
            # Index_estimation: each player's position in arm 0's ranking.
            # NOTE(review): indexs is not read again in this method.
            indexs = np.ones(self.num_players)
            for p_idx in range(self.num_players):
                indexs[p_idx] = self.arms_rankings[0].index(p_idx)

            # arm to match for each player
            At = np.ones(self.num_players)*(-1)
            last_pulled = np.ones(self.num_arms)*(-1)
            # The first num_players rounds are charged the full reference
            # mean as regret and flagged unstable.
            for round in range(self.num_players):
                unstable_one_trial[round]=1
                for p_idx in range(self.num_players):
                    regrets_one_trial[p_idx][round] = self.players_mean[p_idx][self.opt_matching[p_idx]]


            current_player_ranking = [
                np.zeros(self.num_arms, int) for j in range(self.num_players)]
            current_match = np.zeros(self.num_arms)

            l = 0
            flag = False
            while (True):
                estimated_well = np.zeros(self.num_players, bool)
                # Phase l covers rounds [low_limit, high_limit); its length
                # grows with 1.2**l.
                low_limit = int(self.num_players + l + 1.2**l - 2)
                high_limit =int(self.num_players + l + 1.2**(l + 1) - 2)
                l += 1
                for round in range(low_limit, high_limit):
                    # Consecutive phases skip one round (the next low_limit is
                    # the previous high_limit plus one), so the counter can
                    # step over horizon - 1 without ever equalling it.  Use
                    # >= so the per-round buffers are never indexed past the
                    # horizon.
                    if round >= self.horizon - 1:
                        flag = True
                        break 
                    # Round-robin exploration.  estimated_well was reset at
                    # the top of this phase, so every player takes this branch.
                    for p_idx in range(self.num_players):
                        if not estimated_well[p_idx]:
                            At[p_idx] = (p_idx + round - 1) % self.num_arms
                    last_pulled = np.ones(self.num_arms)*(-1)
                    for a_idx in range(self.num_arms):
                        if a_idx in At:
                            # find most matching player for arm
                            for p_rank in range(self.num_players):
                                if At[self.arms_rankings[a_idx][p_rank]] == a_idx:
                                    last_pulled[a_idx] = self.arms_rankings[a_idx][p_rank]
                                    break
                    last_pulled = last_pulled.astype(int)
                    
                    # Exploration rounds are always flagged unstable.
                    unstable_one_trial[round] = 1

                    At = At.astype(int)
                    for p_idx in range(self.num_players):
                        if last_pulled[At[p_idx]] == p_idx:
                            # update: unit-variance Gaussian reward folded
                            # into the running mean
                            reward = np.random.normal(
                                loc=self.players_mean[p_idx][At[p_idx]], scale=1.0, size=None)

                            players_count[p_idx][At[p_idx]] += 1
                            players_es_mean[p_idx][At[p_idx]] += (
                                reward - players_es_mean[p_idx][At[p_idx]]) / players_count[p_idx][At[p_idx]]

                            # record: regret and reward use the true means
                            regrets_one_trial[p_idx][round] = max(
                                0, self.players_mean[p_idx][self.opt_matching[p_idx]] - self.players_mean[p_idx][At[p_idx]])
                            rewards_one_trial[p_idx][round] = self.players_mean[p_idx][At[p_idx]]
                        else:
                            # Rejected: no reward, full reference mean as regret.
                            regrets_one_trial[p_idx][round] = self.players_mean[p_idx][self.opt_matching[p_idx]]
                            rewards_one_trial[p_idx][round] = 0

                # Compute UCB and LCB
                for p_idx in range(self.num_players):
                    for a_idx in range(self.num_arms):
                        if players_count[p_idx][a_idx] == 0:
                            # Never-pulled arms get infinite bounds.
                            ucb[p_idx][a_idx] = float("inf")
                            lcb[p_idx][a_idx] = -float("inf")
                        else:
                            value = math.sqrt(
                                6 * math.log( 2) / players_count[p_idx][a_idx])
                            ucb[p_idx][a_idx] = players_es_mean[p_idx][a_idx] + value
                            lcb[p_idx][a_idx] = players_es_mean[p_idx][a_idx] - value
                    rank_ucb = np.argsort(-ucb[p_idx])
                    rank_lcb = np.argsort(-lcb[p_idx])
                    # Separation test.  For positions below num_players - 1
                    # the LCB at that LCB-rank must exceed the UCB at the next
                    # UCB-rank; for positions >= num_players the LCB at
                    # LCB-rank num_players - 1 must exceed the UCB at that
                    # UCB-rank.  The first failure clears the flag and stops.
                    for a_idx in range(self.num_arms):
                        if a_idx < self.num_players - 1:
                            if lcb[p_idx][rank_lcb[a_idx]] > ucb[p_idx][rank_ucb[a_idx + 1]]:
                                estimated_well[p_idx] = True
                            else:
                                estimated_well[p_idx] = False
                                break
                        elif a_idx >= self.num_players:
                            if lcb[p_idx][rank_lcb[self.num_players - 1]] > ucb[p_idx][rank_ucb[a_idx]]:
                                estimated_well[p_idx] = True
                            else:
                                estimated_well[p_idx] = False
                                break
                    if estimated_well[p_idx] == True:
                        current_player_ranking[p_idx] = rank_ucb

                # Commit rounds begin after start; rounds between the end of
                # exploration and start keep their initial values.
                start = self.num_players + high_limit

                for p_idx in range(self.num_players):
                    if estimated_well[p_idx]:
                        At[p_idx] = current_player_ranking[p_idx][0]

                # Stop exploring once every player passed the test...
                count = 0
                for p_idx in range(self.num_players):
                    if estimated_well[p_idx] == True:
                        count += 1
                if count == self.num_players:
                    break

                # ...or once the horizon was reached during this phase.
                if flag:
                    break

            for round in range(start + 1, self.horizon):
                current_match = self.Gale_Shapley(current_player_ranking)

                # Convert to a player-indexed matching and flag the round
                # when it differs from the reference matching.
                matching = self.get_matching_from_GS(current_match)

                unstable_one_trial[round] = self.isUnOptimalStable(matching)

                for p_idx in range(self.num_players):
                    if matching[p_idx] != -1:
                        regrets_one_trial[p_idx][round] = max(0, self.players_mean[p_idx][self.opt_matching[p_idx]] - self.players_mean[p_idx][matching[p_idx]])
                        rewards_one_trial[p_idx][round] = self.players_mean[p_idx][matching[p_idx]]
                    else:
                        # Unmatched player: no reward, full reference mean as
                        # regret.
                        regrets_one_trial[p_idx][round] = self.players_mean[p_idx][self.opt_matching[p_idx]]
                        rewards_one_trial[p_idx][round] = 0

            # Copy this trial's per-round series into the result buffers.
            for i in range(self.num_players):
                regrets_trials[i][trial] = regrets_one_trial[i]
                rewards_trials[i][trial] = rewards_one_trial[i]

            unstable_trials[trial] = unstable_one_trial
            print(unstable_one_trial)
            print(sum(unstable_one_trial))

        # Persist the raw per-round series; Beta only appears in the file names.
        np.savez('./ResultsData/DecenOpt_ETGS_Beta_'+str(Beta)+'N_' +
                 str(self.num_players)+'_Regret.npz', regret=regrets_trials)
        np.savez('./ResultsData/DecenOpt_ETGS_Beta_'+str(Beta)+'N_' +
                 str(self.num_players)+'_Reward.npz', reward=rewards_trials)
        np.savez('./ResultsData/DecenOpt_ETGS_Beta_'+str(Beta)+'N_' +
                 str(self.num_players)+'_Unstable.npz', unstable=unstable_trials)
        print(unstable_trials)
        cumulative_unstable = np.cumsum(np.array(unstable_trials), axis=1)
        for i in range(self.trials):
            print(cumulative_unstable[i][-1])
    def run_MLETC(self, Beta):
        regrets_trials = np.zeros(
            [self.num_players, self.trials, self.horizon])
        rewards_trials = np.zeros(
            [self.num_players, self.trials, self.horizon])
        unstable_trials = np.zeros([self.trials, self.horizon])
        for trial in tqdm(range(self.trials), ascii=True, desc="Running the ETGS"):
            unstable_one_trial = np.ones(self.horizon)
            regrets_one_trial = np.zeros([self.num_players, self.horizon])
            rewards_one_trial = np.zeros([self.num_players, self.horizon])
            players_es_mean = [np.zeros(self.num_arms)
                               for j in range(self.num_players)]
            ucb = [np.zeros(self.num_arms) for j in range(self.num_players)]
            lcb = [np.zeros(self.num_arms) for j in range(self.num_players)]
            players_count = [np.zeros(self.num_arms)
                             for j in range(self.num_players)]
            
            
            # Index_estimation
            indexs = np.ones(self.num_players)
            for p_idx in range(self.num_players):
                indexs[p_idx] = self.arms_rankings[0].index(p_idx)
            # print(indexs) 

            # arm to match for each player
            At = np.ones(self.num_players)*(-1)
            last_pulled = np.ones(self.num_arms)*(-1)
            for round in range(self.num_players):
                unstable_one_trial[round]=1
                for p_idx in range(self.num_players):
                    regrets_one_trial[p_idx][round] = self.players_mean[p_idx][self.opt_matching[p_idx]]


            current_player_ranking = [
                np.zeros(self.num_arms, int) for j in range(self.num_players)]
            current_match = np.zeros(self.num_arms)

            l = 0
            flag = False
            while (True):
                estimated_well = np.zeros(self.num_players, bool)
                low_limit = int(self.num_players + l * 20 * self.num_arms + (l)*self.num_players *self.num_arms)
                high_limit =int(self.num_players + (l + 1) * 20 * self.num_arms+(l)*self.num_players *self.num_arms)
                l += 1
                for round in range(low_limit, high_limit):
                    if round == self.horizon - 1:
                        flag = True
                        break 
                    for p_idx in range(self.num_players):
                        if not estimated_well[p_idx]:
                            At[p_idx] = (p_idx + round - 1) % self.num_arms
                    last_pulled = np.ones(self.num_arms)*(-1)
                    for a_idx in range(self.num_arms):
                        if a_idx in At:
                            # find most matching player for arm
                            for p_rank in range(self.num_players):
                                if At[self.arms_rankings[a_idx][p_rank]] == a_idx:
                                    last_pulled[a_idx] = self.arms_rankings[a_idx][p_rank]
                                    break
                    # Here: whether stable matching according to last_pulled.
                    last_pulled = last_pulled.astype(int)
                    
                    unstable_one_trial[round] = 1
                    # if round % 996 == 0:
                    #     print(last_pulled, unstable_one_trial[round], estimated_well, self.num_arms, At, lcb[0])

                    At = At.astype(int)
                    for p_idx in range(self.num_players):
                        if last_pulled[At[p_idx]] == p_idx:
                            # update
                            reward = np.random.normal(
                                loc=self.players_mean[p_idx][At[p_idx]], scale=1.0, size=None)

                            players_count[p_idx][At[p_idx]] += 1
                            players_es_mean[p_idx][At[p_idx]] += (
                                reward - players_es_mean[p_idx][At[p_idx]]) / players_count[p_idx][At[p_idx]]

                            # record
                            regrets_one_trial[p_idx][round] = max(
                                0, self.players_mean[p_idx][self.opt_matching[p_idx]] - self.players_mean[p_idx][At[p_idx]])
                            rewards_one_trial[p_idx][round] = self.players_mean[p_idx][At[p_idx]]
                        else:
                            regrets_one_trial[p_idx][round] = self.players_mean[p_idx][self.opt_matching[p_idx]]
                            rewards_one_trial[p_idx][round] = 0

                # Compute UCB and LCB
                for p_idx in range(self.num_players):
                    for a_idx in range(self.num_arms):
                        if players_count[p_idx][a_idx] == 0:
                            ucb[p_idx][a_idx] = float("inf")
                            lcb[p_idx][a_idx] = -float("inf")
                        else:
                            value = math.sqrt(
                                6 * math.log( 2) / players_count[p_idx][a_idx])
                            ucb[p_idx][a_idx] = players_es_mean[p_idx][a_idx] + value
                            lcb[p_idx][a_idx] = players_es_mean[p_idx][a_idx] - value
                    rank_ucb = np.argsort(-ucb[p_idx])
                    rank_lcb = np.argsort(-lcb[p_idx])
                    for a_idx in range(self.num_arms):
                        if a_idx < self.num_players - 1:
                            if lcb[p_idx][rank_lcb[a_idx]] > ucb[p_idx][rank_ucb[a_idx + 1]]:
                                estimated_well[p_idx] = True
                            else:
                                estimated_well[p_idx] = False
                                break
                        elif a_idx >= self.num_players:
                            if lcb[p_idx][rank_lcb[self.num_players - 1]] > ucb[p_idx][rank_ucb[a_idx]]:
                                estimated_well[p_idx] = True
                            else:
                                estimated_well[p_idx] = False
                                break
                    if estimated_well[p_idx] == True:
                        current_player_ranking[p_idx] = rank_ucb

                start = self.num_players + high_limit

                for p_idx in range(self.num_players):
                    if estimated_well[p_idx]:
                        At[p_idx] = current_player_ranking[p_idx][0]

                count = 0
                for p_idx in range(self.num_players):
                    if estimated_well[p_idx] == True:
                        count += 1
                if count == self.num_players:
                    break

                if flag:
                    break
                for round in range(high_limit, high_limit + self.num_players * self.num_arms):
                    regrets_one_trial[p_idx][round] = self.players_mean[p_idx][self.opt_matching[p_idx]]
                    rewards_one_trial[p_idx][round] = 0
                    unstable_one_trial[round] = 1

            for round in range(start + 1, self.horizon):
                current_match = self.Gale_Shapley(current_player_ranking)
                #  Here: whether stable matching according to last_pulled.

                matching = self.get_matching_from_GS(current_match)
                # unstable_one_trial[round] = self.isUnstablePlayer(matching)

                unstable_one_trial[round] = self.isUnOptimalStable(matching)
                # print(matching, unstable_one_trial[round])
                # unstable_one_trial[round] = self.isUnstablePlayer(
                #     self.get_matching_from_GS(current_match))

                # for a_idx, p_idx in enumerate(current_match):
                #     regrets_one_trial[p_idx][round] = max(
                #         0, self.players_mean[p_idx][self.opt_matching[p_idx]] - self.players_mean[p_idx][a_idx])
                #     rewards_one_trial[p_idx][round] = self.players_mean[p_idx][a_idx]

                for p_idx in range(self.num_players):
                    if matching[p_idx] != -1:
                        regrets_one_trial[p_idx][round] = max(0, self.players_mean[p_idx][self.opt_matching[p_idx]] - self.players_mean[p_idx][matching[p_idx]])
                        rewards_one_trial[p_idx][round] = self.players_mean[p_idx][matching[p_idx]]
                    else:
                        regrets_one_trial[p_idx][round] = self.players_mean[p_idx][self.opt_matching[p_idx]]
                        rewards_one_trial[p_idx][round] = 0

            for i in range(self.num_players):
                regrets_trials[i][trial] = regrets_one_trial[i]
                rewards_trials[i][trial] = rewards_one_trial[i]
                # print(regrets_one_trial[i])

            unstable_trials[trial] = unstable_one_trial
            print(unstable_one_trial)
            # print(regrets_one_trial[0])
            print(sum(unstable_one_trial))

        np.savez('./ResultsData/DecenOpt_MLETC_Beta_'+str(Beta)+'N_' +
                 str(self.num_players)+'_Regret.npz', regret=regrets_trials)
        np.savez('./ResultsData/DecenOpt_MLETC_Beta_'+str(Beta)+'N_' +
                 str(self.num_players)+'_Reward.npz', reward=rewards_trials)
        np.savez('./ResultsData/DecenOpt_MLETC_Beta_'+str(Beta)+'N_' +
                 str(self.num_players)+'_Unstable.npz', unstable=unstable_trials)
        print(unstable_trials)
        cumulative_unstable = np.cumsum(np.array(unstable_trials), axis=1)
        for i in range(self.trials):
            print(cumulative_unstable[i][-1])

    # def run_ML_ETC(self, Beta):
    #     regrets_trials = np.zeros(
    #         [self.num_players, self.trials, self.horizon])
    #     rewards_trials = np.zeros(
    #         [self.num_players, self.trials, self.horizon])
    #     unstable_trials = np.zeros([self.trials, self.horizon])
    #     unstable_one_trial = np.ones(self.horizon)
    #     regrets_one_trial = np.zeros([self.num_players, self.horizon])
    #     rewards_one_trial = np.zeros([self.num_players, self.horizon])
    #     for trial in tqdm(range(self.trials), ascii=True, desc="Running the decentralized ML-ETC"):
    #         t_total = np.zeros((self.num_players, self.num_arms), int)
    #         t_total_collision = np.zeros(
    #             (self.num_players, self.num_arms), int)
    #         information = np.zeros(
    #             (self.num_players, self.num_arms, self.num_players), int) - 1
    #         agent = list(np.arange(self.num_players))
    #         arm = list(np.arange(self.num_arms))
    #         match = self.opt_matching
    #         u = np.zeros((self.num_players, self.num_arms))
    #         time = np.zeros((self.num_players, self.num_arms), int)
    #         pull = np.zeros(self.num_players, int)
    #         dot_number = int(self.horizon)
    #         regret = np.zeros(dot_number)
    #         reward = np.zeros(dot_number)
    #         information, t_total, t_total_collision = index_assignment(self.arms_rankings, self.num_players, self.num_arms, information, t_total,
    #                                                                    t_total_collision)
    #         information, t_total, t_total_collision = information_access(self.arms_rankings, self.num_players, self.num_arms, information, t_total,
    #                                                                      t_total_collision)
    #         leader = whether_leader(
    #             information, agent[0], arm, agent, self.num_players)
    #         follower = set()
    #         for i in agent:
    #             if i not in leader:
    #                 follower.add(i)
    #         follower = list(follower)
    #         success = np.zeros(self.num_players, int)
    #         # print("leader", leader)
    #         success, t_total, t_total_collision, reward = success_information(leader, follower, arm, success, self.arms_rankings, information, self.num_players,
    #                                                                           t_total, t_total_collision, agent, reward, self.num_arms, self.players_mean)

    #         while (success == 0).any():
    #             count = 0
    #             for r in reward:
    #                 if r != 0:
    #                     count += 1
    #             # print(count, sum(reward))
    #             u, time, t_total, t_total_collision, reward = exploration(arm, agent, self.arms_rankings, u, self.horizon, time, self.players_mean,
    #                                                                       self.num_players, t_total,
    #                                                                       t_total_collision, reward)
    #             success = np.zeros(self.num_players, int)
    #             for m in range(self.num_players):
    #                 success[m] = whether_success(m, u, time, arm, self.horizon)
    #             success, t_total, t_total_collision, reward = success_information(leader, follower, arm, success,
    #                                                                               self.arms_rankings, information, self.num_players,
    #                                                                               t_total, t_total_collision, agent, reward, self.num_arms, self.players_mean)
    #         pull, arm, t_total, t_total_collision, reward = GS_arm(information, u, leader, self.arms_rankings, arm, follower, self.num_players,
    #                                                                agent, self.num_arms, t_total, t_total_collision, reward, self.players_mean)
    #         for i in leader:
    #             agent.remove(i)
    #             if pull[i] in arm:
    #                 arm.remove(pull[i])
    #         while len(agent) > 0:
    #             leader = whether_leader(
    #                 information, agent[0], arm, agent, self.num_players)
    #             for i in leader:
    #                 follower.remove(i)
    #             success = np.ones(self.num_players, int)
    #             for m in agent:
    #                 success[m] = whether_success(
    #                     m, u, time, arm, self.horizon)
    #             success, t_total, t_total_collision, reward = success_information(leader, follower, arm, success,
    #                                                                               self.arms_rankings, information, self.num_players,
    #                                                                               t_total, t_total_collision,
    #                                                                               agent, reward, self.num_arms, self.players_mean)
    #             while (success == 0).any():
    #                 u, time, t_total, t_total_collision, reward = exploration(arm, agent, self.arms_rankings, u, self.horizon, time,
    #                                                                           value, self.num_players, t_total,
    #                                                                           t_total_collision, reward)
    #                 success = np.ones(self.num_players, int)
    #                 for m in agent:
    #                     success[m] = whether_success(
    #                         m, u, time, arm, self.horizon)
    #                 success, t_total, t_total_collision, reward = success_information(leader, follower, arm, success,
    #                                                                                   self.arms_rankings, information, self.num_players,
    #                                                                                   t_total, t_total_collision, agent, reward, self.num_arms, self.players_mean)
    #             pull, arm, t_total, t_total_collision, reward = GS_arm(information, u, leader, self.arms_rankings, arm,
    #                                                                    follower, self.num_players, agent, self.num_arms, t_total,
    #                                                                    t_total_collision, reward, self.players_mean)
                
    #             for i in leader:
    #                 agent.remove(leader)
    #                 if pull[i] in arm:
    #                     arm.remove(pull[i])
    #         optimal_reward = 0
    #         for i in range(self.num_players):
    #             optimal_reward = optimal_reward + \
    #                 self.players_mean[i][match[i]]
    #         for j in range(dot_number):
    #             if reward[j] != 0:
    #                 regret[j] = optimal_reward * j - reward[j]
    #             else:
    #                 earn = 0
    #                 t = np.zeros(self.num_players)
    #                 for i in range(self.num_players):
    #                     for k in range(self.num_arms):
    #                         earn = t_total[i, k] * \
    #                             self.players_mean[i][k] + earn
    #                         t[i] = t[i] + t_total_collision[i, k]
    #                 regret[j] = 0 - earn
    #                 for i in range(self.num_players):
    #                     if self.horizon < t[i]:
    #                         print('fail')
    #                     regret[j] = (
    #                         t[i]) * (self.players_mean[i][match[i]]) + regret[j]
    #             regret[0] = 0
    #         print(regret)
    #         for tt in range(self.horizon):
    #             unstable_one_trial[tt] = min((regret[tt] - regret[tt-1]), 1)
    #             # print(unstable_one_trial[tt])
    #         for i in range(self.num_players):
    #             regrets_trials[i][trial] = regrets_one_trial[i]
    #             rewards_trials[i][trial] = rewards_one_trial[i]

    #         unstable_trials[trial] = unstable_one_trial
    #         print(sum(unstable_one_trial))
    #     np.savez('./ResultsData/Decen_ML-ETC_Beta_'+str(Beta)+'_N_' +
    #              str(self.num_players)+'_Regret.npz', regret=regrets_trials)
    #     np.savez('./ResultsData/Decen_ML-ETC_Beta_'+str(Beta)+'_N_' +
    #              str(self.num_players)+'_Reward.npz', reward=rewards_trials)
    #     np.savez('./ResultsData/Decen_ML-ETC_Beta_'+str(Beta)+'_N_' +
    #              str(self.num_players)+'_Unstable.npz', unstable=unstable_trials)

    def run_adaptiveGS(self, Beta):
        """Simulate the AdaptiveGS procedure for every trial and save the metrics.

        Each trial consists of two stages:

        1. An initial stage of ``num_players`` rounds in which every player
           pulls arm 0 until it wins that arm and pulls arm 1 afterwards.
           These rounds are always counted as unstable.
        2. A phased exploration stage. Every player keeps a set of plausible
           arms; in each phase the players that are not yet flagged as
           "estimated well" cycle through their plausible arms, confidence
           bounds are recomputed, dominated arms are discarded, and an arm
           claimed by a player that the arm ranks higher is struck from the
           other players' sets.

        Per-player regrets and rewards and the per-step instability indicator
        are written to
        ``./ResultsData/DecenOpt_AdaptiveGS_Beta_<Beta>N_<num_players>_*.npz``
        and summaries are printed to stdout.

        Args:
            Beta: Only used to label the output file names.

        Returns:
            None.
        """
        import os

        # np.savez does not create missing directories; make sure the target
        # exists before spending time on the simulation.
        os.makedirs('./ResultsData', exist_ok=True)

        result_shape = [self.num_players, self.trials, self.horizon]
        regrets_trials = np.zeros(result_shape)
        rewards_trials = np.zeros(result_shape)
        unstable_trials = np.zeros([self.trials, self.horizon])
        full_set = set(range(self.num_arms))

        # Confidence radius is sqrt(6 * log(2) / count).
        # NOTE(review): the argument of the logarithm is the constant 2 rather
        # than a horizon/time dependent quantity -- confirm this is intended.
        radius_scale = 6 * math.log(2)

        for trial in tqdm(range(self.trials), ascii=True, desc="Running the AdaptiveGS"):
            # Every step counts as unstable until the matching says otherwise.
            unstable_one_trial = np.ones(self.horizon)
            regrets_one_trial = np.zeros([self.num_players, self.horizon])
            rewards_one_trial = np.zeros([self.num_players, self.horizon])
            players_es_mean = [np.zeros(self.num_arms)
                               for _ in range(self.num_players)]
            ucb = [np.zeros(self.num_arms) for _ in range(self.num_players)]
            lcb = [np.zeros(self.num_arms) for _ in range(self.num_players)]
            players_count = [np.zeros(self.num_arms)
                             for _ in range(self.num_players)]

            # ---- Initial stage -------------------------------------------
            # arms[p] is the arm player p pulls next; At holds the pulls of
            # the current round.
            arms = np.zeros(self.num_players, dtype=int)
            At = np.full(self.num_players, -1, dtype=int)
            # Bounded by the horizon so a very short horizon cannot index
            # past the end of the per-trial arrays.
            for step in range(min(self.num_players, self.horizon)):
                At = arms.copy()
                last_pulled = self._adaptive_gs_arm_winners(At)
                for p_idx in range(self.num_players):
                    opt_mean = self.players_mean[p_idx][self.opt_matching[p_idx]]
                    if last_pulled[At[p_idx]] == p_idx:
                        pulled_mean = self.players_mean[p_idx][At[p_idx]]
                        regrets_one_trial[p_idx][step] = max(
                            0, opt_mean - pulled_mean)
                        rewards_one_trial[p_idx][step] = pulled_mean
                        if At[p_idx] == 0:
                            # Won arm 0: step aside to arm 1 from now on.
                            arms[p_idx] = 1
                    else:
                        # Lost the conflict: no reward this round.
                        regrets_one_trial[p_idx][step] = opt_mean
                        rewards_one_trial[p_idx][step] = 0
                unstable_one_trial[step] = 1

            # ---- Phased exploration stage --------------------------------
            plau_set = [set(range(self.num_arms))
                        for _ in range(self.num_players)]
            delete_arms = [set() for _ in range(self.num_players)]
            estimated_well = np.zeros(self.num_players)

            # NOTE(review): tt advances once per phase, so all
            # `explore_length` rounds of a phase write into the same time
            # slot -- confirm whether it should advance once per round.
            tt = self.num_players
            while tt < self.horizon:
                # Snapshot of the plausible sets used throughout this phase.
                plau_list = [list(candidates) for candidates in plau_set]
                explore_length = 1
                for p_idx in range(self.num_players):
                    if not estimated_well[p_idx]:
                        explore_length *= len(plau_set[p_idx])

                # Number of leading rounds in which an already-estimated
                # player is treated as colliding on its chosen arm with
                # players that the arm ranks higher.
                collision_times = np.zeros(self.num_players)
                for p_idx in range(self.num_players):
                    if not estimated_well[p_idx]:
                        continue
                    a_opt = plau_list[p_idx][0]
                    own_rank = self.arms_rankings[a_opt].index(p_idx)
                    for p_prime in range(self.num_players):
                        if (a_opt in plau_set[p_prime]
                                and own_rank > self.arms_rankings[a_opt].index(p_prime)):
                            collision_times[p_idx] += explore_length / \
                                len(plau_set[p_prime])

                for step in range(explore_length):
                    for p_idx in range(self.num_players):
                        opt_mean = self.players_mean[p_idx][self.opt_matching[p_idx]]
                        if estimated_well[p_idx]:
                            At[p_idx] = plau_list[p_idx][0]
                            if step < collision_times[p_idx]:
                                # Collision: nothing is observed or earned.
                                # The reward is recorded as 0 so that it is
                                # consistent with the full regret charged.
                                regrets_one_trial[p_idx][tt] = max(0, opt_mean)
                                rewards_one_trial[p_idx][tt] = 0
                                continue
                        else:
                            # Round-robin over the plausible arms.
                            At[p_idx] = plau_list[p_idx][step % len(plau_list[p_idx])]

                        pulled = At[p_idx]
                        pulled_mean = self.players_mean[p_idx][pulled]
                        reward = np.random.normal(
                            loc=pulled_mean, scale=1.0, size=None)

                        # Incremental update of the empirical mean.
                        players_count[p_idx][pulled] += 1
                        players_es_mean[p_idx][pulled] += (
                            reward - players_es_mean[p_idx][pulled]) / players_count[p_idx][pulled]

                        regrets_one_trial[p_idx][tt] = max(
                            0, opt_mean - pulled_mean)
                        rewards_one_trial[p_idx][tt] = pulled_mean

                    # Evaluate the matching once per round, after every
                    # player has chosen its arm.
                    last_pulled = self._adaptive_gs_arm_winners(At)
                    matching_player = np.full(self.num_players, -1.0)
                    for a_idx in range(self.num_arms):
                        if last_pulled[a_idx] != -1:
                            matching_player[last_pulled[a_idx]] = a_idx
                    unstable_one_trial[tt] = self.isUnOptimalStable(
                        matching_player)

                # ---- Update confidence bounds and plausible sets ---------
                for p_idx in range(self.num_players):
                    counts = players_count[p_idx]
                    pulled_mask = counts > 0
                    # Arms never pulled keep an infinite radius, i.e.
                    # ucb = +inf and lcb = -inf (their mean is still 0).
                    radius = np.full(self.num_arms, float("inf"))
                    radius[pulled_mask] = np.sqrt(
                        radius_scale / counts[pulled_mask])
                    ucb[p_idx] = players_es_mean[p_idx] + radius
                    lcb[p_idx] = players_es_mean[p_idx] - radius
                    rank_ucb = np.argsort(-ucb[p_idx])
                    rank_lcb = np.argsort(-lcb[p_idx])

                    if not estimated_well[p_idx]:
                        # Drop arms dominated by another plausible arm, but
                        # never shrink the set to or below this size.
                        min_plausible = self.num_players - estimated_well.sum()
                        for arm_i in plau_list[p_idx]:
                            for arm_j in plau_list[p_idx]:
                                if (lcb[p_idx][arm_i] > ucb[p_idx][arm_j]
                                        and arm_i in plau_set[p_idx]
                                        and arm_j in plau_set[p_idx]
                                        and len(plau_set[p_idx]) > min_plausible):
                                    plau_set[p_idx].remove(arm_j)

                    # Plausible arm with the highest LCB, and the plausible
                    # arm with the highest UCB among the remaining ones.
                    high_lcb = next(
                        (int(a) for a in rank_lcb if a in plau_set[p_idx]), -1)
                    high_ucb = next(
                        (int(a) for a in rank_ucb
                         if a in plau_set[p_idx] and a != high_lcb), -1)
                    # NOTE(review): when no competitor exists high_ucb stays
                    # -1 and the comparison silently reads the *last* arm's
                    # UCB; behaviour kept as-is, confirm the intent.
                    if lcb[p_idx][high_lcb] > ucb[p_idx][high_ucb]:
                        estimated_well[p_idx] = 1
                        plau_set[p_idx] = {high_lcb}

                # An arm claimed by an estimated player is removed from every
                # player that the arm ranks below the claimant.
                for p_idx in range(self.num_players):
                    if not estimated_well[p_idx]:
                        continue
                    a_opt = list(plau_set[p_idx])[0]
                    own_rank = self.arms_rankings[a_opt].index(p_idx)
                    for p_prime in range(self.num_players):
                        if own_rank < self.arms_rankings[a_opt].index(p_prime):
                            delete_arms[p_prime].add(a_opt)
                            if a_opt in plau_set[p_prime]:
                                plau_set[p_prime] = full_set - \
                                    delete_arms[p_prime]

                tt += 1

            regrets_trials[:, trial, :] = regrets_one_trial
            rewards_trials[:, trial, :] = rewards_one_trial
            unstable_trials[trial] = unstable_one_trial
            print(sum(unstable_one_trial))

        out_prefix = ('./ResultsData/DecenOpt_AdaptiveGS_Beta_' + str(Beta) +
                      'N_' + str(self.num_players))
        np.savez(out_prefix + '_Regret.npz', regret=regrets_trials)
        np.savez(out_prefix + '_Reward.npz', reward=rewards_trials)
        np.savez(out_prefix + '_Unstable.npz', unstable=unstable_trials)
        print(unstable_trials)
        cumulative_unstable = np.cumsum(np.array(unstable_trials), axis=1)
        for i in range(self.trials):
            print(cumulative_unstable[i][-1])

    def _adaptive_gs_arm_winners(self, At):
        """Return, for each arm, the player that wins it under the pulls ``At``.

        Args:
            At: Integer array; ``At[p]`` is the arm pulled by player ``p``.

        Returns:
            Integer array of length ``num_arms`` whose entry ``a`` is the
            first player in ``arms_rankings[a]`` that pulled arm ``a``, or
            ``-1`` when nobody pulled it.
        """
        winners = np.full(self.num_arms, -1, dtype=int)
        for a_idx in range(self.num_arms):
            if a_idx not in At:
                continue
            for p_rank in range(self.num_players):
                candidate = self.arms_rankings[a_idx][p_rank]
                if At[candidate] == a_idx:
                    winners[a_idx] = candidate
                    break
        return winners

