R implementation of attention, see blog post: https://qua.st/attention-in-R
# attention.R
# Bastiaan Quast
# bquast@gmail.com
# based on:
# https://machinelearningmastery.com/the-attention-mechanism-from-scratch/
# encoder representations of four different words
word_1 = matrix(c(1,0,0), nrow=1)
word_2 = matrix(c(0,1,0), nrow=1)
word_3 = matrix(c(1,1,0), nrow=1)
word_4 = matrix(c(0,0,1), nrow=1)
# stacking the word embeddings into a single array
words = rbind(word_1,
              word_2,
              word_3,
              word_4)
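# illustrative sanity check (added, not in the original): the stacked
# embeddings form a 4 x 3 matrix, one row per word
stopifnot(all(dim(words) == c(4, 3)))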
# generating the weight matrices
set.seed(42)
W_Q = matrix(floor(runif(9, min=0, max=3)), nrow=3, ncol=3)
W_K = matrix(floor(runif(9, min=0, max=3)), nrow=3, ncol=3)
W_V = matrix(floor(runif(9, min=0, max=3)), nrow=3, ncol=3)
# redefine matrices to match random numbers generated by Python in the original code
W_Q = matrix(c(2, 0, 2,
               2, 0, 0,
               2, 1, 2),
             nrow=3, ncol=3, byrow=TRUE)
W_K = matrix(c(2, 2, 2,
               0, 2, 1,
               0, 1, 1),
             nrow=3, ncol=3, byrow=TRUE)
W_V = matrix(c(1, 1, 0,
               0, 1, 1,
               0, 0, 0),
             nrow=3, ncol=3, byrow=TRUE)
# generating the queries, keys and values
Q = words %*% W_Q
K = words %*% W_K
V = words %*% W_V
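# illustrative check (added): projecting the 4 x 3 embedding matrix through
# the 3 x 3 weight matrices gives one query, key and value vector per word,
# so Q, K and V are each 4 x 3
stopifnot(all(dim(Q) == c(4, 3)), all(dim(K) == c(4, 3)), all(dim(V) == c(4, 3)))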
# scoring the query vectors against all key vectors
scores = Q %*% t(K)
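# illustrative example (added): each entry of the 4 x 4 scores matrix is the
# dot product of one query with one key, e.g. word 1's query against word 3's key
stopifnot(isTRUE(all.equal(as.numeric(Q[1,] %*% K[3,]), scores[1,3])))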
# calculate the max of each row of the scores matrix (for a numerically stable softmax)
maxs = as.matrix(apply(scores, MARGIN=1, FUN=max))
# initialize weights matrix
weights = matrix(0, nrow=4, ncol=4)
# computing the weights by a softmax operation, scaling the scores by sqrt(d_k)
for (i in 1:nrow(scores)) {
  weights[i,] = exp((scores[i,] - maxs[i,]) / ncol(K)^0.5) /
    sum(exp((scores[i,] - maxs[i,]) / ncol(K)^0.5))
}
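# equivalent vectorized softmax (added as an optional cross-check, not part of
# the original code): scale the scores by sqrt(d_k) first, then apply a
# row-wise softmax; the result should match the loop above
scaled   = scores / sqrt(ncol(K))
weights2 = t(apply(scaled, MARGIN=1, FUN=function(x) exp(x - max(x)) / sum(exp(x - max(x)))))
stopifnot(isTRUE(all.equal(weights, weights2)))
# each row of weights is a probability distribution over the four words
stopifnot(isTRUE(all.equal(rowSums(weights), rep(1, 4))))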
# computing the attention by a weighted sum of the value vectors
attention = weights %*% V
print(attention)
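# added note: the result is a 4 x 3 matrix in which each row is the attention
# output for one word, i.e. a weighted combination of the value vectors
stopifnot(all(dim(attention) == c(4, 3)))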