%% DEMONSTRATION FILE
%
% The purpose of this file is to illustrate the method via a sequence of
% synthetic studies and evaluation metrics. It will follow closely the
% synthetic experiments done in the companion paper.
%
% In what follows, we will show how to generate synthetic models and
% problems, how to run the model, how to run summaries for the whole 
% procedure and how to visualize some of the results.
%
% It is assumed that prior to running any of the following, the script file
% "setpath.m" has been executed. 
%
% Version 1.0, May 5th 2016

%% PART 0: SETUP DIRECTORIES

% We will need two directories, one where synthetic data will be generated
% and one where output results will be stored. CHANGE THIS ACCORDING TO
% YOUR OWN CONFIGURATION.

% Both folders live under one common root. Edit 'base_dir' (or the two
% derived paths) to match your machine.
base_dir   = 'C:/temp/gp_obsint';
data_dir   = [base_dir, '/data'];     % Where synthetic problems are written
result_dir = [base_dir, '/results'];  % Where inference output is written

%% PART 1: SETUP EXPERIMENTAL PARAMETERS

% Each synthetic model follows a causal graph with the structure
%
% - (Z_1, ... Z_p) has no independence constraints
% - X <- {Z_1, ..., Z_p}
% - Y <- {X, Z_1, ..., Z_p}
%
% where in the above A <- B means that set B is the parent set of A. The
% actual parameters used in this structure are described in function
% "synth/generate_problem.m" and in the paper. Here, X is treatment and Y
% is outcome, while covariates Z are common causes.
%
% Model + datasets will be generated for every possible combination of
% response function degree and signal level, as defined below.

% As in the companion paper, we will set up a simulation study that
% generates polynomial dose-response functions. In the paper, we
% had quadratic and cubic polynomials. This will be set as follows:

degree_choice = 2:3;   % Quadratic and cubic dose-response polynomials

% "Signal levels" relate the variability of the signal to the noise of the
% outcome variable. See "synth/generate_problem.m" and the paper for the
% exact definition.

signal_levels = [0.10, 0.25];

% Sample size n of each observational dataset, and the number p of
% covariates that are common causes of treatment and outcome.

n = 1000; % Decrease this if you want the demo to run faster
p = 25;   % Decrease this if you want the demo to run faster, but make sure
          % that num_Z_sel in PART 4 is set to be less than p.

% "Likelihood range": [lower, upper] bounds of the interval from which
% error variances for the likelihood function are sampled uniformly. This
% will be proportional to the amount of signal.

lik_range = [0.2, 0.4];

% Minimum level of confounding: when generating datasets, a model is
% rejected if its confounding is not strong enough. See
% "synth/generate_problem.m" for details on how this is calculated.

confound_strength = 0.2;

% Number of different dosage levels allowed in the simulated intervention
% generator.

num_X_space = 20;

% Sample sizes to be generated in the interventional regime (one
% interventional dataset size per entry).

num_total_exps = [40, 100, 200];

% Number of synthetic problems generated for every combination of degree
% and signal level. The paper used 50 problems; for the sake of
% demonstration only 2 are generated here. Change it back to 50 to
% reconstruct the same datasets used in the paper.

num_problems = 2;

%% PART 2: GENERATE MODELS AND DATA

% Check main directory exists first

% The top-level data directory is never created automatically: stop right
% away if it is missing.

if exist(data_dir, 'dir') == 0
  error('ERROR: given data directory %s needs to be created first.', data_dir)
end

% Load the stored variable 'seed', which is handed to the generator so
% that the same datasets as in the paper can be reconstructed.

load('synth/seed.mat', 'seed');

% One sub-folder of problems per (degree, signal level) combination. The
% sub-folder is created on demand.

for d = degree_choice
  for s = signal_levels
    signal_tag = 100 * (1 - s);  % Number used in folder names and log lines
    dir_name = strcat(data_dir, '/poly', num2str(d), '_signal', num2str(signal_tag));
    if exist(dir_name, 'dir') == 0
      mkdir(dir_name)
    end
    fprintf('[GENERATING DATA (degree = %d, signal_level = %d)]\n', d, signal_tag)
    generate_problems(num_problems, dir_name, p, n, d, s, lik_range, confound_strength, seed)
  end
end

%% PART 3: PREPARATION FOR RUNNING EXPERIMENTS

% The next stage is to sort the covariate space Z by how 'influential' 
% they are regarding confounding X and Y. This information will be useful 
% when benchmarking problems by how badly the assumption of no unmeasured 
% confounding is violated (namely, by removing the "strongest" confounders 
% the observational data will induce more bias).
%
% The following process is very slow, though. It can be easily parallelized
% but the code below doesn't attempt any parallelization. In the interest
% of time, it can be skipped if one does not want to run the "sorted Z"
% experiments of the next section.

fprintf('* WARNING * THE FOLLOWING WILL BE VERY SLOW\n')

num_iter_sorting = 150;  % Iteration count handed to batch_Z_sorting

% Sort the covariates of every problem folder created in PART 2, one
% (degree, signal level) combination at a time.

for d = degree_choice
  for s = signal_levels
    signal_tag = 100 * (1 - s);  % Number used in folder names and log lines
    dir_name = strcat(data_dir, '/poly', num2str(d), '_signal', num2str(signal_tag));
    fprintf('[COVARIATE SORTING (degree = %d, signal_level = %d)]\n', d, signal_tag)
    % NOTE(review): the meaning of the trailing 'true' flag is defined in
    % batch_Z_sorting and is not visible from this script.
    batch_Z_sorting(dir_name, num_X_space, num_iter_sorting, true)
  end
end

%% PART 4: RUNNING EXPERIMENTS

% We get to the point where the observational-interventional learning is
% done. Two variations, on top of the degree and signal level: we go
% through the case where unmeasured confounding is introduced by a random
% selection of observable covariates (sorted_Z = false) and the case where
% a selection of possibly "strong" confounders has been hidden
% (sorted_Z = true).
%
% The following process is very slow, though. It can be easily parallelized
% but the code below doesn't attempt any parallelization. If the previous 
% section (PART 3) has not been executed, the following code will still 
% work, provided sorted_Z is not allowed to be set to true.

fprintf('* WARNING * THE FOLLOWING WILL BE VERY SLOW\n')

M         = 5000; % Number of MCMC iterations
burn_in   = 500;  % Number of burn in iterations to be discarded
num_iter  = 300;  % Number of optimizations steps
num_Z_sel = 15;   % Number of covariates to be provided as observables

% Fail fast on bad configuration, before the (very slow) loop starts.
% PART 1 documents that num_Z_sel must be less than the number of
% covariates p; this matters when p is decreased to speed up the demo.

if num_Z_sel >= p
  error('ERROR: num_Z_sel (%d) must be less than the number of covariates p (%d).', num_Z_sel, p)
end

% The top-level result directory is never created automatically.

if ~exist(result_dir, 'dir')
  error('ERROR: given result directory %s needs to be created first.', result_dir)
end

% For every (degree, signal level) combination, read the problems from the
% matching data sub-folder and write results to the sub-folder of the same
% name under result_dir (created on demand).

for d = degree_choice
  for s = signal_levels
    signal_tag = 100 * (1 - s);  % Number used in folder names and log lines
    sub_dir = strcat('/poly', num2str(d), '_signal', num2str(signal_tag));
    path_input = strcat(data_dir, sub_dir);
    path_output = strcat(result_dir, sub_dir);
    if ~exist(path_output, 'dir')
      mkdir(path_output)
    end
    % Run both variations: randomly selected observable covariates
    % (sorted_Z = false) and the sorted-covariate case (sorted_Z = true),
    % which requires PART 3 to have been executed.
    for sorted_Z = [false true]
      fprintf('[INFERENCE (degree = %d, signal_level = %d, sorted Z = %d)]\n', d, signal_tag, sorted_Z)
      % NOTE(review): the meaning of the final argument (1) is defined in
      % batch_synth_run_and_compare and is not visible from this script.
      batch_synth_run_and_compare(path_input, path_output, num_Z_sel, sorted_Z, M, burn_in, num_total_exps, num_X_space, num_iter, 1)
    end
  end
end

%% PART 5: SUMMARIZING RESULTS

% We get to the point where the observational-interventional learning has
% been recorded, now we provide a report for every possible configuration.
%
% The "mean abs" error, as discussed in the paper, is the absolute difference
% between the estimated dose-response curve and the true curve, averaged
% over treatment levels.

% Print one report per (degree, signal level, sorted_Z) configuration,
% reading problems from the data sub-folder and results from the matching
% result sub-folder.

for d = degree_choice
  for s = signal_levels

    signal_tag = 100 * (1 - s);  % Number used in folder names and log lines
    sub_dir = strcat('/poly', num2str(d), '_signal', num2str(signal_tag));
    path_input = strcat(data_dir, sub_dir);
    path_output = strcat(result_dir, sub_dir);

    for sorted_Z = [false true]
      fprintf('[SUMMARY OF RESULTS (degree = %d, signal_level = %d, sorted Z = %d)]\n\n', d, signal_tag, sorted_Z)
      batch_summarize_results(path_input, path_output, sorted_Z);
      fprintf('********\n\n')  % Separator between consecutive reports
    end

  end
end

%% PART 6: SAMPLE VISUALIZATION

% This will generate many plots for a particular problem instance (in that
% case, the second problem of degree 3 and signal level 0.1, without
% pre-sorting the covariates).
%
% Here, green curves are the truth, and dashed red curves are the means
% according to the generated distribution (be it prior or posterior). All
% samples are shown. The magenta line in the "observational data" plot is
% the curve fitted to the observational data without any covariate
% adjustment. Differences between the green and magenta lines, besides
% those expected from statistical variability, are mostly due to
% unmeasured confounding. 
%
% If 'plot_a_hat' is set to true, it will also display a "true" distortion 
% function in the "distortion only" plots, as a green curve. The "true"
% distortion is the ratio between the observational dose-response curve and 
% the true dose-response. If estimation is good, the red and green lines in 
% the "distortion only" plots should overlap. However, the "true"
% distortion may vary abruptly at some points if estimation is bad, and
% this will be hard to visualize. For that reason, in the example below we
% set 'plot_a_hat' to false.

% Configuration of the single problem instance to be plotted. The indices
% below must exist in the settings of PART 1 (degree_choice needs at least
% two entries, and num_problems must be at least problem_number).
d              = degree_choice(2);  % Second entry of degree_choice
s              = signal_levels(1);  % First entry of signal_levels
sorted_Z       = false;             % Use the runs without pre-sorted covariates
problem_number = 2;
v              = 3;                 % Which interventional set to use
num_prior      = 5000;              % Number of samples generated for prior visualization

% Same folder naming scheme as in the data generation and inference parts.
sub_dir     = strcat('/poly', num2str(d), '_signal', num2str(100 * (1 - s)));
path_input  = strcat(data_dir, sub_dir);
path_output = strcat(result_dir, sub_dir);

% NOTE(review): no 'plot_a_hat' argument is passed here; presumably
% synth_visualize_problem defaults it to false - confirm.
synth_visualize_problem(path_input, path_output, sorted_Z, problem_number, num_prior, v)
