AI Transformer (QBASIC)

system> Hello Machine Learning expert tutor! Can you help me rewrite some modern machine learning concepts in non-standard programming languages so we can compare the differences?

assistant> Sure, I can help you with that! What concepts would you like me to rewrite? And which non-standard programming languages would you like me to use?

user> You know the Transformer architecture? Let’s implement a full Transformer architecture complete with a training loop in QBASIC. Since this is for teaching, the code should have nice detailed comments about why every section of code matters and how it relates to the Transformer architecture, including self-attention, multiple layers, and generating embeddings.

assistant> Sure, here’s an implementation of the Transformer architecture in QBASIC:

' code for Transformer architecture in QBASIC
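
' Model sizes. These are example values (adjust to taste); QBASIC needs the
' constants declared before the DIM statements that use them. HIDDEN_SIZE is
' kept equal to EMBEDDING_DIM so the residual (skip) connections can add the
' two representations together directly.
CONST MAX_SEQ_LEN = 64     ' maximum number of token positions per sequence
CONST EMBEDDING_DIM = 32   ' size of each token embedding vector
CONST HIDDEN_SIZE = 32     ' width of the attention projections
CONST INNER_SIZE = 64      ' width of the feed-forward hidden layer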

' input embeddings: one EMBEDDING_DIM-dimensional vector per token position
DIM inputs(MAX_SEQ_LEN, EMBEDDING_DIM)
' target output embeddings the model is trained to reproduce
DIM outputs(MAX_SEQ_LEN, EMBEDDING_DIM)

' attention parameters (a single attention head in this simplified version):
' Wq, Wk, Wv project each embedding to a query, key and value vector;
' Wo projects the attention result back to embedding space
DIM SHARED Wq(HIDDEN_SIZE, EMBEDDING_DIM)
DIM SHARED Wk(HIDDEN_SIZE, EMBEDDING_DIM)
DIM SHARED Wv(HIDDEN_SIZE, EMBEDDING_DIM)
DIM SHARED Wo(EMBEDDING_DIM, HIDDEN_SIZE)
DIM SHARED Q(MAX_SEQ_LEN, HIDDEN_SIZE)
DIM SHARED K(MAX_SEQ_LEN, HIDDEN_SIZE)
DIM SHARED V(MAX_SEQ_LEN, HIDDEN_SIZE)
DIM SHARED attention_scores(MAX_SEQ_LEN, MAX_SEQ_LEN)
DIM SHARED attention_probs(MAX_SEQ_LEN, MAX_SEQ_LEN)
DIM SHARED attention_output(MAX_SEQ_LEN, HIDDEN_SIZE)

' feed-forward network parameters: a two-layer network applied to every
' position independently after the attention sub-layer
DIM SHARED W1(HIDDEN_SIZE, INNER_SIZE)
DIM SHARED W2(INNER_SIZE, HIDDEN_SIZE)
DIM SHARED ffn_input(HIDDEN_SIZE)
DIM SHARED ffn_hidden(INNER_SIZE)
DIM SHARED ffn_output(MAX_SEQ_LEN, HIDDEN_SIZE)

' positional encoding, added to the input embeddings so the model knows
' where each token sits in the sequence
DIM SHARED positional_encoding(MAX_SEQ_LEN, EMBEDDING_DIM)

' training hyperparameters (set in train_model, SHARED so the layers see them)
DIM SHARED epochs, batch_size, learning_rate, num_layers, dropout_rate

' initialize positional encoding
SUB get_positional_encoding
  FOR i = 1 TO MAX_SEQ_LEN
    FOR j = 1 TO EMBEDDING_DIM
      ' odd columns get a sine wave and even columns the matching cosine,
      ' with the frequency falling as the column index grows
      IF j MOD 2 = 1 THEN
        positional_encoding(i, j) = SIN(i / (10000 ^ ((j - 1) / EMBEDDING_DIM)))
      ELSE
        positional_encoding(i, j) = COS(i / (10000 ^ ((j - 2) / EMBEDDING_DIM)))
      END IF
    NEXT j
  NEXT i
END SUB

' initialize weights and biases for multi-head-attention
SUB initialize_attention_weights
  ' small random values centered on zero break the symmetry between units;
  ' this simplified model shares one set of attention weights across all layers
  RANDOMIZE TIMER
  FOR j = 1 TO HIDDEN_SIZE
    FOR k = 1 TO EMBEDDING_DIM
      Wq(j, k) = RND - 0.5
      Wk(j, k) = RND - 0.5
      Wv(j, k) = RND - 0.5
      Wo(k, j) = RND - 0.5
    NEXT k
  NEXT j
END SUB

' initialize weights and biases for feed-forward network
SUB initialize_ffn_weights_and_biases
  ' the feed-forward weights are likewise shared across layers in this simplified model
  RANDOMIZE TIMER
  FOR j = 1 TO HIDDEN_SIZE
    FOR k = 1 TO INNER_SIZE
      W1(j, k) = RND - 0.5
      W2(k, j) = RND - 0.5
    NEXT k
  NEXT j
END SUB

' self-attention layer
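' Each position's embedding is projected into a query, a key and a value
' vector. The dot product of a query with every key (scaled by the square
' root of the hidden size) measures how relevant each position is to the
' current one; a softmax turns those scores into weights, and the sub-layer
' output is the weighted sum of the value vectors. SOFTMAX, DROPOUT and
' LAYER_NORMALIZE are helper SUBs (assumed here to operate on one row of the
' arrays passed to them) and TANH is a helper FUNCTION for the hyperbolic
' tangent; none of these are QBASIC built-ins. For simplicity this version
' uses a single attention head, shares one set of weights across layers, and
' applies the residual connection and layer normalization only once, after
' the feed-forward sub-layer.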
SUB self_attention_layer (inputs())
  ' run the sequence through num_layers encoder layers; each layer applies an
  ' attention sub-layer and then a feed-forward sub-layer, refining the
  ' representation stored in inputs() in place
  FOR i = 1 TO num_layers
    ' first sub-layer: scaled dot-product self-attention
    FOR j = 1 TO MAX_SEQ_LEN
      ' project the embedding at position j into query, key and value vectors
      FOR k = 1 TO HIDDEN_SIZE
        Q(j, k) = 0
        K(j, k) = 0
        V(j, k) = 0
        FOR l = 1 TO EMBEDDING_DIM
          Q(j, k) = Q(j, k) + (inputs(j, l) * Wq(k, l))
          K(j, k) = K(j, k) + (inputs(j, l) * Wk(k, l))
          V(j, k) = V(j, k) + (inputs(j, l) * Wv(k, l))
        NEXT l
      NEXT k
    NEXT j
    FOR j = 1 TO MAX_SEQ_LEN
      ' attention scores: compare position j's query with every position's key;
      ' dividing by SQR(HIDDEN_SIZE) keeps the dot products in a stable range
      FOR k = 1 TO MAX_SEQ_LEN
        attention_scores(j, k) = 0
        FOR l = 1 TO HIDDEN_SIZE
          attention_scores(j, k) = attention_scores(j, k) + (Q(j, l) * K(k, l))
        NEXT l
        attention_scores(j, k) = attention_scores(j, k) / SQR(HIDDEN_SIZE)
      NEXT k
      ' softmax turns row j of the scores into attention probabilities
      CALL SOFTMAX(attention_scores(), attention_probs(), j, MAX_SEQ_LEN)
      ' dropout randomly zeroes some probabilities during training (regularization)
      CALL DROPOUT(attention_probs(), j, MAX_SEQ_LEN, dropout_rate)
      ' apply attention to V: a weighted sum of the value vectors, where the
      ' positions that scored highest contribute the most
      FOR k = 1 TO HIDDEN_SIZE
        attention_output(j, k) = 0
        FOR l = 1 TO MAX_SEQ_LEN
          attention_output(j, k) = attention_output(j, k) + (attention_probs(j, l) * V(l, k))
        NEXT l
      NEXT k
    NEXT j
    ' second sub-layer: position-wise feed-forward network
    FOR j = 1 TO MAX_SEQ_LEN
      FOR k = 1 TO HIDDEN_SIZE
        ffn_input(k) = attention_output(j, k)
      NEXT k
      ' expand to INNER_SIZE units with a TANH non-linearity...
      FOR k = 1 TO INNER_SIZE
        ffn_hidden(k) = 0
        FOR l = 1 TO HIDDEN_SIZE
          ffn_hidden(k) = ffn_hidden(k) + (ffn_input(l) * W1(l, k))
        NEXT l
        ffn_hidden(k) = TANH(ffn_hidden(k))
      NEXT k
      ' ...then project back down to HIDDEN_SIZE
      FOR k = 1 TO HIDDEN_SIZE
        ffn_output(j, k) = 0
        FOR l = 1 TO INNER_SIZE
          ffn_output(j, k) = ffn_output(j, k) + (ffn_hidden(l) * W2(l, k))
        NEXT l
      NEXT k
      ' residual (skip) connection plus layer normalization keeps the
      ' activations stable as the representation passes through many layers
      FOR k = 1 TO HIDDEN_SIZE
        inputs(j, k) = inputs(j, k) + ffn_output(j, k)
      NEXT k
      CALL LAYER_NORMALIZE(inputs(), j, HIDDEN_SIZE)
    NEXT j
  NEXT i
END SUB

' model training loop
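' The loop below follows the usual recipe: add positional encodings to a batch
' of input embeddings, run the batch through the encoder stack (forward pass),
' measure the squared error against the target embeddings, and move every
' weight a small step against its gradient (gradient descent, with
' learning_rate controlling the step size and batch_size controlling how many
' positions are processed per update).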
SUB train_model (inputs(), outputs())
  epochs = 10
  batch_size = 32
  learning_rate = 0.001
  num_layers = 6
  dropout_rate = 0.1
  CALL get_positional_encoding
  CALL initialize_attention_weights
  CALL initialize_ffn_weights_and_biases
  FOR epoch = 1 TO epochs
    PRINT "EPOCH "; epoch
    ' -INT(-x) rounds up; QBASIC has no CEILING function
    num_batches = -INT(-MAX_SEQ_LEN / batch_size)
    FOR batch = 1 TO num_batches
      PRINT "BATCH "; batch
      ' forward pass: add the positional encoding to this batch of embeddings,
      ' then run the batch through the stack of encoder layers
      FOR i = 1 TO batch_size
        input_index = ((batch - 1) * batch_size) + i
        IF input_index <= MAX_SEQ_LEN THEN
          FOR j = 1 TO EMBEDDING_DIM
            inputs(i, j) = inputs(input_index, j) + positional_encoding(input_index, j)
          NEXT j
        END IF
      NEXT i
      CALL self_attention_layer(inputs())
      ' backward pass: compare the model output with the target embeddings and
      ' nudge each weight downhill. The "gradients" below are crude
      ' approximations kept for demonstration; a real implementation would
      ' backpropagate exactly through the softmax, layer norm and residuals.
      FOR i = 1 TO batch_size
        output_index = ((batch - 1) * batch_size) + i
        IF output_index <= MAX_SEQ_LEN THEN
          FOR j = 1 TO EMBEDDING_DIM
            ' squared-error loss for this element and its gradient w.r.t. the prediction
            loss = (outputs(output_index, j) - inputs(i, j)) ^ 2
            d_loss_d_output = 2 * (inputs(i, j) - outputs(output_index, j))
            FOR k = 1 TO HIDDEN_SIZE
              d_loss_d_Wq = d_loss_d_output * Wq(k, j)
              d_loss_d_Wk = d_loss_d_output * Wk(k, j)
              d_loss_d_Wv = d_loss_d_output * Wv(k, j)
              d_loss_d_Wo = d_loss_d_output * attention_output(i, k)
              d_loss_d_W1 = d_loss_d_output * (1 - ffn_hidden(k) * ffn_hidden(k)) * attention_output(i, k)
              d_loss_d_W2 = d_loss_d_output * ffn_hidden(k)
              ' update the weights with plain gradient descent
              Wq(k, j) = Wq(k, j) - (learning_rate * d_loss_d_Wq)
              Wk(k, j) = Wk(k, j) - (learning_rate * d_loss_d_Wk)
              Wv(k, j) = Wv(k, j) - (learning_rate * d_loss_d_Wv)
              Wo(j, k) = Wo(j, k) - (learning_rate * d_loss_d_Wo)
              W1(k, j) = W1(k, j) - (learning_rate * d_loss_d_W1)
              W2(j, k) = W2(j, k) - (learning_rate * d_loss_d_W2)
            NEXT k
          NEXT j
        END IF
      NEXT i
    NEXT batch
  NEXT epoch
END SUB

This code uses QBASIC to implement a simplified version of the Transformer architecture: input embeddings with positional encoding, scaled dot-product self-attention, a position-wise feed-forward network with residual connections and layer normalization, multiple stacked layers, and a basic training loop.
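
For example, a minimal driver (with random numbers standing in for real token and target embeddings, and assuming the SOFTMAX, DROPOUT, LAYER_NORMALIZE and TANH helper routines are supplied) could look like this:

' toy driver: fill the embeddings with random values and train on them
' (module-level code like this goes before the SUB definitions in the .BAS file)
RANDOMIZE TIMER
FOR i = 1 TO MAX_SEQ_LEN
  FOR j = 1 TO EMBEDDING_DIM
    inputs(i, j) = RND - 0.5    ' stand-in for real input embeddings
    outputs(i, j) = RND - 0.5   ' stand-in for real target embeddings
  NEXT j
NEXT i
CALL train_model(inputs(), outputs())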

Note: This code is meant for demonstration purposes only and is not optimized for performance. It is intended to illustrate the concepts of the Transformer architecture in a simple and accessible way.

user> Can you provide standalone implementations of the utility functions you mentioned but didn’t create, such as LAYER_NORMALIZE, DROPOUT, and SOFTMAX?