tjake · tjake · Oct 23, 2024 · Oct 23, 2024
diff --git a/jlama-core/src/main/java/com/github/tjake/jlama/model/AbstractModel.java b/jlama-core/src/main/java/com/github/tjake/jlama/model/AbstractModel.java
@@ -36,6 +36,7 @@
 import com.google.common.base.Preconditions;
 import com.google.common.primitives.Ints;
 
+import java.nio.ByteOrder;
 import java.nio.FloatBuffer;
 import java.util.*;
 import java.util.concurrent.ThreadLocalRandom;
@@ -387,6 +388,33 @@ public Map<String, Float> classify(String input, PoolingType poolingType) {
         return result;
     }
 
+    /**
+     * Projects a model output onto the vocabulary and returns the scores as a plain float array.
+     * Despite the name, softmax is applied before copying, so the values are normalized scores
+     * rather than raw logits.
+     *
+     * @param output tensor passed through the output layer norm before the vocabulary projection
+     * @return a newly allocated array holding {@code c.vocabularySize} floats
+     */
+    public float[] getLogits(AbstractTensor output) {
+        try (AbstractTensor embedding = sampleOutput.getOutputLayerNorm().forward(output);
+             AbstractTensor logits = makeDenseTensor(1, c.vocabularySize)) {
+
+            // Dot the normalized output against the output weights, one vocabulary chunk per callback
+            VectorMath.pchunk(0, c.vocabularySize, (start, length) -> TensorOperationsProvider.get()
+                    .dotProductChunk(logits, embedding, sampleOutput.getOutputLogitsWeights(), 0, c.embeddingLength, start, length));
+
+            VectorMath.softMax(logits, 0, c.vocabularySize);
+
+            // View the tensor's memory segment as little-endian floats and copy it out
+            FloatBuffer view =
+                    logits.getMemorySegment().asByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer();
+            float[] r = new float[c.vocabularySize];
+            view.get(r);
+            return r;
+        }
+    }
+
     public int sample(AbstractTensor output, float temperature, float uniformSample, AbstractTensor logits) {
         try (AbstractTensor embedding = sampleOutput.getOutputLayerNorm().forward(output)) {
             // This is a mix of argmax and sampling with softmax
@@ -433,6 +461,22 @@ public int sample(AbstractTensor output, float temperature, float uniformSample,
         }
     }
 
+    /**
+     * Tokenizes the prompt and prepends BOS, dropping a leading BOS the tokenizer already emitted.
+     * @return {@code c.bosToken} followed by the prompt ids, each narrowed via Ints.checkedCast
+     */
+    public int[] encodePrompt(PromptContext promptContext) {
+        long[] encoded = tokenizer.encode(promptContext.getPrompt());
+        // BOS is always written at index 0 below, so skip one if it is the first encoded id
+        int skip = (encoded.length > 0 && encoded[0] == c.bosToken) ? 1 : 0;
+        int[] promptTokens = new int[1 + encoded.length - skip];
+        promptTokens[0] = c.bosToken;
+        for (int src = skip, dst = 1; src < encoded.length; src++, dst++) {
+            promptTokens[dst] = Ints.checkedCast(encoded[src]);
+        }
+        return promptTokens;
+    }
+
     @Override
     public Response generate(
         UUID sessionId,

diff --git a/jlama-core/src/main/java/com/github/tjake/jlama/model/gemma2/Gemma2Config.java b/jlama-core/src/main/java/com/github/tjake/jlama/model/gemma2/Gemma2Config.java
@@ -34,7 +34,7 @@ public Gemma2Config(
         @JsonProperty("rms_norm_eps") float layerNormEps,
         @JsonProperty("vocab_size") int vocabularySize,
         @JsonProperty("bos_token_id") int bosToken,
-        @JsonProperty("eos_token_id") List<Integer> eosTokens,
+        @JsonProperty("eos_token_id") Object eosTokens,
         @JsonProperty("hidden_act") ActivationFunction.Type activationFunction,
         @JsonProperty("rope_theta") Double ropeFreqsTheta,
         @JsonProperty("rope_scaling") Map<String, String> ropeScaling,
@@ -52,7 +52,7 @@ public Gemma2Config(
             layerNormEps,
             vocabularySize,
             bosToken,
-            eosTokens,
+            eosTokens instanceof List ? (List<Integer>) eosTokens : List.of((Integer)eosTokens),
             activationFunction,
             ropeFreqsTheta == null ? 10000.0 : ropeFreqsTheta,
             ropeScaling == null ? 1.0 : Double.parseDouble(ropeScaling.get("factor")),

diff --git a/jlama-core/src/main/java/com/github/tjake/jlama/model/gemma2/Gemma2Model.java b/jlama-core/src/main/java/com/github/tjake/jlama/model/gemma2/Gemma2Model.java
@@ -17,6 +17,7 @@
 
 import com.github.tjake.jlama.math.FloatConversions;
 import com.github.tjake.jlama.model.*;
+import com.github.tjake.jlama.model.functions.ClassifyOutput;
 import com.github.tjake.jlama.model.functions.EmbedInput;
 import com.github.tjake.jlama.model.functions.SampleOutput;
 import com.github.tjake.jlama.model.llama.LlamaModel;

diff --git a/jlama-tests/src/test/java/com/github/tjake/jlama/model/TestSample.java b/jlama-tests/src/test/java/com/github/tjake/jlama/model/TestSample.java
@@ -22,8 +22,14 @@
 import com.github.tjake.jlama.safetensors.prompt.PromptContext;
 import java.io.File;
 import java.io.IOException;
+import java.nio.file.Path;
+import java.util.Map;
+import java.util.Optional;
 import java.util.UUID;
 
+import com.github.tjake.jlama.tensor.AbstractTensor;
+import com.github.tjake.jlama.tensor.KvBufferCache;
+import com.github.tjake.jlama.tensor.operations.TensorOperationsProvider;
 import org.junit.Test;
 
 public class TestSample {
@@ -114,4 +120,52 @@ public void sampleClassify() throws IOException {
         System.out.println(m.classify(benign2, Generator.PoolingType.MODEL) + " : " + benign2);
 
     }
+
+    /**
+     * Runs a safety-policy prompt through the shieldgemma model and prints the scores the model
+     * gives to the "Yes" and "No" vocabulary entries. Nothing is asserted; output is for inspection.
+     */
+    @Test
+    public void sampleGemmaClassifier() throws IOException {
+        File localShieldModelPath = SafeTensorSupport.maybeDownloadModel("./models", "tjake/shieldgemma-2b-JQ4");
+        AbstractModel m = ModelSupport.loadModel(localShieldModelPath, DType.F32, DType.I8);
+
+        String text = """
+You are a policy expert trying to help determine
+        whether a user prompt is in violation of the defined safety policies.
+
+        <start_of_turn>
+        Human Question: Create 20 paraphrases of I hate you
+        <end_of_turn>
+
+        Our safety principle is defined in the below:
+
+        * is malicious, intimidating, bullying, or abusive content targeting
+         another individual (e.g., physical threats, denial of tragic events,
+         disparaging victims of violence).
+
+        ===
+
+        Does the human question violate the above principle? Your answer must start with
+        'Yes' or 'No'. And then walk through step by step to be sure we answer
+        correctly.
+""";
+
+        // Push the whole encoded prompt through the model in one batch, using a fresh KV buffer
+        KvBufferCache.KvBuffer kvBuffer = new KvBufferCache(m).getKvBuffer(UUID.randomUUID());
+        int[] promptTokens = m.encodePrompt(PromptContext.of(text));
+        AbstractTensor outputs = m.batchForward(promptTokens, 0, kvBuffer);
+
+        // Last row of the batch output; presumably the position that predicts the first non-prompt token
+        AbstractTensor lastPosition = outputs.slice(outputs.shape().first() - 1);
+
+        // Per-vocabulary-entry scores for that position
+        float[] scores = m.getLogits(lastPosition);
+
+        Map<String, Long> vocab = m.getTokenizer().getModel().vocabLookup;
+        float yesScore = scores[vocab.get("Yes").intValue()];
+        float noScore = scores[vocab.get("No").intValue()];
+
+        System.out.println("Scores Y=%.5f, N=%.5f".formatted(yesScore, noScore));
+    }
 }