
%% Recover Borkar's Q-learning scheme for Whittle indices
% For every reference state s the script runs MC independent learning runs
% of K steps each. A run jointly updates a Q table and a scalar lambda (the
% running Whittle-index estimate for s); lambda is passed to model() where
% it is added to the reward of the passive action. The per-step lambda
% traces are averaged over the MC runs and plotted for every state.
clear   % only workspace variables need resetting ('clear all' also flushes cached functions)

% ---- learning parameters ----
epsilon = 0.5;    % exploration probability (1-epsilon = exploit / epsilon = explore)
K       = 10000;  % number of learning iterations per run
MC      = 100;    % number of Monte Carlo runs averaged per reference state
verbose = false;  % true restores the per-iteration / per-run console trace (very slow)

% ---- problem definition ----
state  = [1,2,3,4];   % state labels
action = [0,1];       % action labels: 0 = passive, 1 = active
passive_idx = find(action == 0);   % column of Q holding the passive action
active_idx  = find(action == 1);   % column of Q holding the active action

% W(s,k): lambda after iteration k for reference state s, averaged over MC runs
W = zeros(length(state), K);

%% the main loop of the algorithm
for s = 1:length(state)

    Wmc = zeros(MC, K);   % lambda trace of every run for this reference state

    for mc = 1:MC
        % Every run starts from the same initial conditions so that the
        % averaged runs are independent of each other (previously Q leaked
        % from one run / reference state into the next).
        Q         = -1*ones(length(state), length(action));
        lambda    = 0;    % initialise the Whittle-index estimate
        state_idx = s;    % each run starts in the reference state

        for k = 1:K
            step = 1/sqrt(k);   % step size shared by the Q and lambda updates

            % epsilon-greedy action selection
            if rand < 1 - epsilon
                [~, action_idx] = max(Q(state_idx,:));   % exploit
            else
                action_idx = randi(length(action));      % explore: uniform pick
            end

            % observe the next state and reward (no reward matrix; model() simulates)
            [next_state, next_reward] = model(state(state_idx), action(action_idx), lambda);
            next_state_idx = find(state == next_state);

            % Q update. The mean over all Q entries is subtracted as the
            % reference offset (was hard-coded as 1/8*sum(sum(Q))).
            Q(state_idx,action_idx) = Q(state_idx,action_idx) + step * ...
                (next_reward + max(Q(next_state_idx,:)) - mean(Q(:)) - Q(state_idx,action_idx));

            % lambda is only updated on visits to the reference state; it
            % moves along the active-minus-passive Q gap at that state.
            if state_idx == s
                lambda = lambda + step*(Q(s,active_idx) - Q(s,passive_idx));
            end

            if verbose
                disp(['iteration: ' num2str(k)]);
                disp(['current state : ' num2str(state(state_idx))  ' taken action : ' num2str(action(action_idx)) ' next state : ' num2str(state(next_state_idx))]);
                disp([' next reward : ' num2str(next_reward)]);
                disp(Q);
                disp(lambda)
            end

            state_idx = next_state_idx;
            Wmc(mc,k) = lambda;
        end

        if verbose
            % end-of-run summary, each label followed by the value it names
            disp('Final Q matrix : ');
            disp(Q)
            [C,I] = max(Q,[],2);   % best value and best action index per state
            disp('Q(optimal):');
            disp(C);
            disp('Optimal Policy');
            disp(action(I));
            disp('Whittle Index');
            disp(lambda);
        end
    end

    % average the lambda traces over the Monte Carlo runs
    W(s,:) = mean(Wmc, 1);
    fprintf('reference state %d: averaged %d runs, final index estimate %g\n', ...
        state(s), MC, W(s,K));
end

% ---- plot every 10th sample of the averaged traces, one curve per state ----
figure;
sample_idx = 10:10:K;
plot(sample_idx, W(:, sample_idx).');
legend(arrayfun(@(v) sprintf('s%d', v), state, 'UniformOutput', false));

%%

%% This function is used as an observer to give the next state and the next reward using the current state and action
function [next_state,r] = model(s,a, lambda)
% MODEL  Simulate one transition of the 4-state circular chain.
%
%   [next_state, r] = model(s, a, lambda)
%
%   s      - current state label, one of 1..4
%   a      - action, 0 (passive) or 1 (active)
%   lambda - value added to the reward when the passive action is taken
%
%   next_state - with probability 0.5 (uniform draw > 0.5) the chain moves
%                one step: forward for a = 1 (1->2->3->4->1), backward for
%                a = 0 (1->4, 4->3, 3->2, 2->1); otherwise it stays in s.
%   r          - base reward of the current state (-1, 0, 0, 1 for states
%                1..4) plus (1-a)*lambda.
%
%   Raises model:invalidInput for any other (s, a) pair instead of
%   returning with unassigned outputs.

    base_reward = [-1, 0, 0, 1];       % reward attached to states 1..4
    n_states    = numel(base_reward);

    t = rand;                          % single uniform draw decides move / stay

    if ~isscalar(s) || ~isscalar(a) || ~any(s == 1:n_states) || ~any(a == [0, 1])
        error('model:invalidInput', ...
            'model expects s in 1..%d and a in {0,1}.', n_states);
    end

    direction  = 2*a - 1;              % +1 for the active action, -1 for the passive one
    moved      = (t > 0.5);
    % wrap around the ring: 0-based position, shift, back to 1-based label
    next_state = mod(s - 1 + direction*moved, n_states) + 1;

    r = base_reward(s) + (1 - a)*lambda;
end