function [best_theta,best_corr,cache] = generic_oracle(r,X,f,theta_space,...
    cache,delta,regtype,lambda,grad_eps,Ngradsteps,Nrestarts)
% [best_theta,best_corr,cache] = generic_oracle(r,X,f,theta_space,cache,delta,regtype,lambda,grad_eps,Ngradsteps,Nrestarts)
%  A function that will *not* be used directly
%  Rather, use it to create an anonymous function of r (and cache)
%  generic_oracle uses gradient ascent with numerical (forward-difference)
%   derivatives and random restarts to find a theta such that tilde-f_theta
%   is highly correlated with r
%
% Inputs:
%  - r, nx1 vector: the residual
%  - X, p x n design matrix X
%  - f, function handle of the form f(x,theta): specifies kernel family
%  - theta_space, D x K0 matrix: columns of theta_space form a grid over
%     the parameter space
%  - cache: can be left blank [], or a struct with the following fields
%   * thetas, D x K1 matrix: previously cached parameter points
%   * inds, 1 x K1 matrix: the theta_space cell index for cached thetas
%   * F, n x K1 matrix: previously cached signal matrix
%   * nms, 1 x K1 matrix: norms of the columns of F
%    If left blank, the cache is initialized from theta_space itself.
%  - delta, scalar: normally the minimum spacing between points in theta_space;
%     every search stays inside the box of side delta centered on a grid point
%  - regtype, 'none', 'L1', or 'unweighted': regularization type
%     (passed through to predictor_matrix)
%  - lambda, scalar: penalization constant (passed through to predictor_matrix)
%  - grad_eps, scalar: controls the step size used for the numerical derivative
%     if grad_eps has a second element: controls multiplier (<1) used for
%     line search (default 0.5)
%  - Ngradsteps, positive integer: number of gradient steps
%     if Ngradsteps has a second element: controls number of line search
%     steps (default 5)
%  - Nrestarts, positive integer: number of restarts
%
% Outputs:
%  - best_theta, Dx1 vector: the best parameter found during this call
%     (theta_space(:,1) if no restart was evaluated)
%  - best_corr, scalar: its normalized inner product with the residual
%     (-Inf if no restart was evaluated)
%  - cache: the input cache with the successful new points appended;
%     a struct with the following fields
%   * thetas, D x K1 matrix: cached parameter points
%   * inds, 1 x K1 matrix: the theta_space cell index for cached thetas
%   * F, n x K1 matrix: cached signal matrix
%   * nms, 1 x K1 matrix: norms of the columns of F
%
%  Example usage:
%   theta_space = .5 .* (fullfact([5 5])-3)'; % grid on [-1,1]^2
%   delta = .5;
%   thetas = sample_parameters(theta_space,delta,1); % samples a point  from [-1.25,1.25]^2
%   w = 1;
%   f = @(x,theta) exp(-norm(x-theta)^2); % gaussian kernel
%   X = randn(2,100);
%   regtype = 'none'; lambda = 0;
%   noise_type = 'gaussian'; noise=0.1; % specifies gaussian noise N(0, 0.01)
%   [y,y0] = generate_signal(X, thetas, w, f, noise_type, noise); % generates the kernel function plus noise
%   grad_eps = [1e-5 0.9]; % uses step size of 1e-5 for numerical derivative, multiplier of 0.9 for line search
%   Ngradsteps = [3 10]; % does 3 gradient steps, max 10 line search steps
%   Nrestarts = 100; % number of random restarts
%
%   % create the oracle tau as an anonymous function
%   tau = @(r,cache) generic_oracle(r,X,f,theta_space,cache,delta,regtype,lambda,grad_eps,Ngradsteps,Nrestarts);
%   [~, ~,~, ~, ~,cache] = nnls_fit(y,X,[],theta_space,f,regtype,lambda);
%
%   [theta_est, corr_est] = tau(y,cache);   % find theta using noisy signal
%   [theta_est0,corr_est0] = tau(y0,cache); % find theta using noiseless signal
%   % compare the theta found by the oracle with the true theta generating the data
%   [thetas, theta_est0, theta_est]
%   % compare their correlations
%   nmlz = @(v) v./norm(v); % normalizes a vector
%   corr_y_0 = nmlz(predictor_matrix(X, thetas(:,1), f, regtype, lambda))'*y;
%   corr_y0_0 = nmlz(predictor_matrix(X, thetas(:,1), f, regtype, lambda))'*y0;
%   [corr_y0_0, corr_est0; corr_y_0, corr_est]

%% Parse optional control parameters

n = size(r,1);
Nlinesteps = 5;    % default number of line search steps
line_mult = 0.5;   % default line search multiplier
% numel (rather than size) so that the optional second element is picked up
% for both row and column vectors; "if size(v) > 1" is false for any vector
% because "if" requires ALL elements of the comparison to be true.
if numel(Ngradsteps) > 1
  Nlinesteps = Ngradsteps(2);
end
if numel(grad_eps) > 1
  line_mult = grad_eps(2);
end

grad_eps = grad_eps(1);
Ngradsteps = Ngradsteps(1);

K0 = size(theta_space,2);
D = size(theta_space,1);

% with no cache supplied, seed it with the grid points themselves
if isempty(cache)
  cached_F = predictor_matrix(X, theta_space, f, regtype, lambda);
  % sum along dimension 1 explicitly so that n == 1 still gives column norms
  cached_nms = sqrt(sum(cached_F.^2,1));
  cache = struct('thetas',theta_space,'inds',1:K0,'F',cached_F,'nms',cached_nms);
end

%% Useful functions

nmlz = @(v) v./norm(v); % normalizes a vector
% gets the correlation of f_theta with r
corrr = @(r,theta)  nmlz(predictor_matrix(X, theta, f, regtype, lambda))'*r ;

%% Initialization

best_corr = -Inf;
best_theta = theta_space(:,1);

% one candidate cache entry per restart; only the successful ones are kept
new_cache_thetas = zeros(D,Nrestarts);
new_cache_F = zeros(n,Nrestarts);
new_cache_inds = zeros(1,Nrestarts);
new_cache_nms = zeros(1,Nrestarts);
success_flags = zeros(1,Nrestarts);

%% Filtering step
% Computes correlations with all cached points
% The algorithm will go through cached points in order of decreasing
% correlation with r

cache_corrs = r'* cache.F ./ cache.nms;
[~,o] = sort(-cache_corrs);

% Restarts cycle through the K0 best cached points; bounded by the number of
% cached points so that a cache smaller than the grid cannot index out of range
Nstart = min(K0, numel(o));

%% Main loop

for iter = 1:Nrestarts
  % select starting point
  o_ind = o(mod(iter-1,Nstart)+1);
  theta_ind = cache.inds(o_ind);
  theta_center = theta_space(:, theta_ind);
  theta = cache.thetas(:, o_ind);

  % perturb starting point: pull the offset from the cell center inward so
  % that a perturbation of size shift_size cannot leave the box of
  % half-width delta/2 around theta_center
  shift_size = delta/2 * min([1,exprnd(1)]) * rand(1);
  shift0 = theta-theta_center;
  temp = shift0+shift_size;
  shift0(temp > delta/2) = delta/2-shift_size;
  temp = -shift0+shift_size;
  shift0(temp > delta/2) = -delta/2+shift_size;
  assert(max(max([shift0+shift_size, -shift0+shift_size])) <= delta/2);
  theta = theta_center + shift0 + shift_size * (2*rand(D,1)-1);

  % compute the correlation and cache the results
  Fnew = predictor_matrix(X, theta, f, regtype, lambda);
  Fnm = norm(Fnew);
  corr0 = (r'* Fnew) / Fnm;
  new_cache_thetas(:,iter) = theta;
  new_cache_F(:,iter) = Fnew;
  new_cache_inds(iter) = theta_ind;
  new_cache_nms(iter) = Fnm;

  if corr0 > best_corr
    best_corr=corr0;
    best_theta=theta;
  end
  % the perturbed point is worth caching if it beats the point it came from
  if corr0 > cache_corrs(o_ind)
    success_flags(iter) = 1;
  end

  for graditer = 1:Ngradsteps
    flag=1;
    % evaluate gradient by forward differences
    grad = zeros(D,1);
    for d = 1:D
      theta_temp = theta;
      theta_temp(d) = theta_temp(d)+grad_eps;
      grad(d) = (corrr(r,theta_temp) - corr0)./grad_eps;
    end
    % a zero gradient gives no direction; a non-finite gradient (e.g. a
    % predictor with zero norm) would produce a NaN step, so skip both
    if norm(grad)==0 || any(~isfinite(grad))
      flag=0;
    end
    % do a line search
    count = 0;
    neg_limit = -(delta/2) - (theta - theta_center);
    pos_limit = (delta/2) - (theta - theta_center);
    % calculates step size to remain in the box
    step_size = min([1; neg_limit(grad < 0)./grad(grad< 0); pos_limit(grad > 0)./grad(grad > 0)]);
    if step_size <= 0
      flag = 0;
    end
    while flag && (count < Nlinesteps)
      % shrink first, so the very first trial is strictly inside the box
      step_size = step_size * line_mult;
      count = count+1;
      theta_temp = theta + step_size.* grad;
      assert(max(abs(theta_temp - theta_center)) < delta/2);
      Fnew = predictor_matrix(X, theta_temp, f, regtype, lambda);
      Fnm = norm(Fnew);
      corr_temp = (r'* Fnew) / Fnm;
      if corr_temp > corr0
        % accept the step and end this line search
        theta = theta_temp;
        corr0 = corr_temp;
        new_cache_thetas(:,iter) = theta;
        new_cache_F(:,iter) = Fnew;
        new_cache_inds(iter) = theta_ind;
        new_cache_nms(iter) = Fnm;
        flag = 0;
        success_flags(iter) = 1;
      end
      if corr_temp > best_corr
        best_corr=corr_temp;
        best_theta=theta_temp;
      end
    end
    % END line search
  end
  % END gradient ascent
end

% append only the restarts that improved on their starting point
sff = success_flags ==1;
cache = struct('thetas',[cache.thetas,new_cache_thetas(:,sff)],...
               'inds', [cache.inds, new_cache_inds(sff)], ...
               'F',[cache.F, new_cache_F(:,sff)],...
               'nms',[cache.nms, new_cache_nms(sff)]);
end
