#' @title Double Thompson Sampling Learner
#'
#' @include Learner.R
#'
#' @description
#' This Learner specializes [Learner] to implement the Double Thompson Sampling algorithm.
#'
DTS <- R6Class("DTS", inherit = Learner,
  public = list(

    #' @field B_mat (`numeric(n,n)`)\cr
    #' Win-count matrix: `B_mat[i, j]` is the number of recorded duels in
    #' which arm `i` was preferred over arm `j`.
    B_mat = NULL,

    #' @field U_mat (`numeric(n,n)`)\cr
    #' Upper confidence bounds on the pairwise preference probabilities.
    #' Initialised to 1; an entry is only rewritten when its pair is updated.
    U_mat = NULL,

    #' @field L_mat (`numeric(n,n)`)\cr
    #' Lower confidence bounds on the pairwise preference probabilities.
    #' Initialised to 0; an entry is only rewritten when its pair is updated.
    L_mat = NULL,

    #' @field zeta (`numeric(n)`)\cr
    #' Optimistic (upper-confidence) normalised Copeland score of every arm:
    #' the fraction of *other* arms `j` with `U_mat[i, j] > 1/2`.
    #' Recomputed at the start of every call to `action()`.
    zeta = NULL,

    #' @field alpha (`numeric()`)\cr
    #' Hyperparameter steering the degree of optimism; it scales the width
    #' of the confidence intervals.
    alpha = NULL,

    #' @field n (`integer()`)\cr
    #' Number of arms.
    n = NULL,

    #' @description
    #' Creates a new instance of this [R6][R6::R6Class] class.
    #'
    #' @param data_model_specs (`list()`)\cr
    #'   List whose element `num_arms` gives the number of arms. Other
    #'   elements (e.g. `dim`) are not read by this learner.
    #' @param aggregation
    #'   Forwarded unchanged to the [Learner] constructor.
    #' @param alpha (`numeric(1)`)\cr
    #'   Confidence-width hyperparameter, stored in `alpha`.
    initialize = function(data_model_specs = list(num_arms = 2, dim = 1), aggregation, alpha) {
      super$initialize(aggregation = aggregation, action_size = 2)

      self$n <- data_model_specs$num_arms
      self$B_mat <- matrix(0, nrow = self$n, ncol = self$n)
      self$U_mat <- matrix(1, nrow = self$n, ncol = self$n)
      self$L_mat <- matrix(0, nrow = self$n, ncol = self$n)
      self$alpha <- alpha
      self$zeta <- private$upper_copeland_scores()
    },

    #' @description
    #' Selects the pair of arms to duel next.
    #'
    #' The first arm is drawn by Thompson sampling among the arms with maximal
    #' optimistic Copeland score; the second arm is drawn by a second round of
    #' Thompson sampling against the first arm, restricted to arms whose lower
    #' confidence bound against the first arm does not exceed 1/2.
    #'
    #' @param data_model
    #'   Not used by this method.
    #' @return Vector `c(a1, a2)` of arm indices. `a2` equals `a1` when no
    #'   other arm is eligible as challenger.
    action = function(data_model) {
      n_arms <- self$n

      # The confidence bounds change in update(), so the optimistic Copeland
      # scores have to be refreshed before the candidate set is formed.
      self$zeta <- private$upper_copeland_scores()
      candidates <- which(self$zeta == max(self$zeta))

      # Posterior sampling: draw every unordered pair exactly once (i < j) and
      # mirror it. The diagonal stays 0 so it never counts as a win.
      theta <- matrix(0, nrow = n_arms, ncol = n_arms)
      for (i in seq_len(n_arms - 1)) {
        for (j in (i + 1):n_arms) {
          theta[i, j] <- rbeta(1, self$B_mat[i, j] + 1, self$B_mat[j, i] + 1)
          theta[j, i] <- 1 - theta[i, j]
        }
      }

      # Copeland scores on the posterior samples.
      sampled_score <- rowSums(theta > 1 / 2)

      # 1st candidate: best sampled score *within* the candidate set, ties
      # broken uniformly at random.
      if (length(candidates) > 1) {
        cand_score <- sampled_score[candidates]
        best <- candidates[cand_score == max(cand_score)]
        # Index with sample.int(): sample(k, 1) on a single index k would
        # draw from 1:k instead of returning k.
        a1 <- best[sample.int(length(best), 1)]
      } else {
        a1 <- candidates
      }

      # 2nd candidate: fresh posterior draw of every other arm against a1.
      # If no arm has a lower confidence bound <= 1/2 against a1, fully
      # commit to a1.
      a2 <- a1
      best_draw <- 0
      for (i in seq_len(n_arms)[-a1]) {
        draw <- rbeta(1, self$B_mat[i, a1] + 1, self$B_mat[a1, i] + 1)
        if (self$L_mat[i, a1] <= 0.5 && draw > best_draw) {
          best_draw <- draw
          a2 <- i
        }
      }

      c(a1, a2)
    },

    #' @description
    #' Queries the duel outcome for the chosen pair and updates the win
    #' counts and the confidence bounds of that pair.
    #'
    #' @param chosen_arms
    #'   Vector whose first two entries are the arm indices that were played.
    #' @param data_model
    #'   Object providing `getFeedback(arms, timestep)`. A result equal to
    #'   `TRUE` is treated as a win of `chosen_arms[1]`, anything else as a
    #'   win of `chosen_arms[2]`.
    update = function(chosen_arms, data_model) {
      first_wins <- data_model$getFeedback(c(chosen_arms[1], chosen_arms[2]), self$timestep)
      if (first_wins == TRUE) {
        pick <- chosen_arms[1]
        not_picked <- chosen_arms[2]
      } else {
        pick <- chosen_arms[2]
        not_picked <- chosen_arms[1]
      }

      # Update the win matrix.
      self$B_mat[pick, not_picked] <- self$B_mat[pick, not_picked] + 1

      # Update the confidence intervals of the played pair; both directions
      # share the same number of duels and therefore the same radius.
      # NOTE(review): `self$timestep` is not defined in this file (presumably
      # maintained by Learner); the radius is NaN if it is ever below 1.
      wins_pick <- self$B_mat[pick, not_picked]
      wins_other <- self$B_mat[not_picked, pick]
      duels <- wins_pick + wins_other
      radius <- sqrt((self$alpha * log(self$timestep)) / duels)

      self$U_mat[pick, not_picked] <- wins_pick / duels + radius
      self$U_mat[not_picked, pick] <- wins_other / duels + radius
      self$L_mat[pick, not_picked] <- wins_pick / duels - radius
      self$L_mat[not_picked, pick] <- wins_other / duels - radius
    }
  ),

  private = list(

    # Optimistic normalised Copeland scores derived from U_mat: for each arm,
    # the share of the other arms whose upper confidence bound exceeds 1/2.
    # The diagonal is excluded, and the denominator is floored at 1 so that a
    # single-arm instance yields 0 instead of dividing by zero.
    upper_copeland_scores = function() {
      may_beat <- self$U_mat > 1 / 2
      diag(may_beat) <- FALSE
      rowSums(may_beat) / max(self$n - 1, 1)
    }
  )
)