work on new code-gen - generate SCHEMA$ for enum classes (#310)

linkedin · Apr 13, 2022 · 2c4f0da · 2c4f0da
1 parent a21320d
commit 2c4f0da
Show file tree

Hide file tree

Showing 10 changed files with 213 additions and 110 deletions.
diff --git a/avro-codegen/build.gradle b/avro-codegen/build.gradle
@@ -38,7 +38,7 @@ dependencies {
         exclude group: "com.thoughtworks.paranamer", module: "paranamer-ant"
         exclude group: "org.slf4j"
     }
-    testImplementation 'net.openhft:compiler:2.3.6'
+    testImplementation 'net.openhft:compiler:2.4.1'
 }
 
 jar {

diff --git a/avro-codegen/src/main/java/com/linkedin/avroutil1/codegen/SpecificRecordClassGenerator.java b/avro-codegen/src/main/java/com/linkedin/avroutil1/codegen/SpecificRecordClassGenerator.java
@@ -6,15 +6,22 @@
 
 package com.linkedin.avroutil1.codegen;
 
+import com.linkedin.avroutil1.compatibility.HelperConsts;
+import com.linkedin.avroutil1.compatibility.SourceCodeUtils;
 import com.linkedin.avroutil1.model.AvroEnumSchema;
 import com.linkedin.avroutil1.model.AvroNamedSchema;
 import com.linkedin.avroutil1.model.AvroType;
+import com.linkedin.avroutil1.writer.avsc.AvscSchemaWriter;
 import com.squareup.javapoet.ClassName;
+import com.squareup.javapoet.CodeBlock;
 import com.squareup.javapoet.FieldSpec;
 import com.squareup.javapoet.JavaFile;
 import com.squareup.javapoet.TypeSpec;
 import javax.lang.model.element.Modifier;
 import javax.tools.JavaFileObject;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+import java.util.StringJoiner;
 
 
 /**
@@ -56,13 +63,7 @@ protected JavaFileObject generateSpecificEnum(AvroEnumSchema enumSchema, Specifi
     }
 
     //add public final static SCHEMA$
-    ClassName avroSchemaType = ClassName.get("org.apache.avro", "Schema");
-    classBuilder.alwaysQualify(avroSchemaType.simpleName()); //no import statements
-    classBuilder.addField(FieldSpec
-        .builder(avroSchemaType, "SCHEMA$", Modifier.PUBLIC, Modifier.FINAL, Modifier.STATIC)
-        .initializer("null") //TODO - provide avsc string here
-        .build()
-    );
+    addSchema$ToGeneratedClass(classBuilder, enumSchema);
 
     //create file object
     TypeSpec classSpec = classBuilder.build();
@@ -73,4 +74,52 @@ protected JavaFileObject generateSpecificEnum(AvroEnumSchema enumSchema, Specifi
 
     return javaFile.toJavaFileObject();
   }
+
+  /**
+   * adds a "public final static Schema SCHEMA$" field to generated classes for named avro types.
+   * the field is defined as:
+   * <pre>
+   * public final static Schema SCHEMA$ = [HelperConsts.HELPER_FQCN].parse(avsc1, avsc2, avsc3 ...)
+   * </pre>
+   * where the arguments are pieces of the input schema's avsc representation, as produced by
+   * {@link AvscSchemaWriter#writeSingle}. a single string constant in a class file cannot be larger
+   * than 65535 bytes, so large avsc literals are chunked and passed to parse() as multiple arguments.
+   * @param classBuilder builder for a class being generated
+   * @param classSchema schema of the class being generated
+   */
+  protected void addSchema$ToGeneratedClass(TypeSpec.Builder classBuilder, AvroNamedSchema classSchema) {
+    //the JVM spec says string constants cant be over 65535 bytes in size. this isnt simply the
+    //character count, as the limit applies to the "modified UTF-8" encoding used in class files.
+    //for details see https://docs.oracle.com/javase/specs/jvms/se8/html/jvms-4.html#jvms-4.4.7
+    //we add some extra safety margin
+    final int maxUnsplitBytes = 64000;
+    //chunk size is in characters. a single char never takes more than 3 bytes in modified UTF-8,
+    //so a chunk of this many characters stays under the limit above
+    final int chunkSizeChars = 20000;
+
+    ClassName avroSchemaType = ClassName.get("org.apache.avro", "Schema");
+    classBuilder.alwaysQualify(avroSchemaType.simpleName()); //no import statements
+
+    //get avsc for the schema
+    AvscSchemaWriter avscWriter = new AvscSchemaWriter();
+    String avsc = avscWriter.writeSingle(classSchema).getContents();
+
+    String parseFormat;
+    Object[] parseFormatArgs;
+    if (modifiedUtf8Length(avsc) > maxUnsplitBytes) {
+      List<String> chunks = SourceCodeUtils.safeSplit(avsc, chunkSizeChars);
+      StringJoiner csv = new StringJoiner(", ");
+      for (int i = 1; i <= chunks.size(); i++) {
+        //"$1S, $2S, ... $NS" - one indexed string argument per chunk
+        csv.add("$" + i + "S");
+      }
+      parseFormat = HelperConsts.HELPER_FQCN + ".parse(" + csv + ")";
+      parseFormatArgs = chunks.toArray();
+    } else {
+      //no need to split anything
+      parseFormat = HelperConsts.HELPER_FQCN + ".parse($1S)";
+      parseFormatArgs = new Object[] {avsc};
+    }
+    classBuilder.addField(FieldSpec
+            .builder(avroSchemaType, "SCHEMA$", Modifier.PUBLIC, Modifier.FINAL, Modifier.STATIC)
+            //TODO - use strict parsing
+            .initializer(CodeBlock.of(parseFormat, parseFormatArgs))
+            .build()
+    );
+  }
+
+  /**
+   * computes the size of a string when encoded as "modified UTF-8", which is the encoding
+   * the 65535 byte limit on class file string constants applies to. this differs from
+   * standard UTF-8 in that the NUL character takes 2 bytes and every surrogate char takes
+   * 3 bytes (so a supplementary character takes 6 bytes and not 4).
+   * @param value string to measure
+   * @return number of bytes value would take up as a class file string constant
+   */
+  private static long modifiedUtf8Length(String value) {
+    long bytes = 0;
+    for (int i = 0; i < value.length(); i++) {
+      char c = value.charAt(i);
+      if (c >= 0x0001 && c <= 0x007F) {
+        bytes += 1;
+      } else if (c <= 0x07FF) {
+        //also covers NUL, which is encoded in 2 bytes
+        bytes += 2;
+      } else {
+        bytes += 3;
+      }
+    }
+    return bytes;
+  }
 }
diff --git a/...odegen/src/test/java/com/linkedin/avroutil1/codegen/SpecificRecordClassGeneratorTest.java b/...odegen/src/test/java/com/linkedin/avroutil1/codegen/SpecificRecordClassGeneratorTest.java
@@ -31,4 +31,18 @@ public void testSimpleEnum() throws Exception {
 
     CompilerHelper.assertCompiles(javaSourceFile);
   }
+
+  /**
+   * generates a class for the enum schema in schemas/SimpleEnumWithHugeDoc.avsc
+   * and verifies that the generated source compiles.
+   */
+  @Test
+  public void testHugeEnum() throws Exception {
+    //load and parse the schema, expecting a clean parse
+    String schemaJson = TestUtil.load("schemas/SimpleEnumWithHugeDoc.avsc");
+    AvscParseResult parsed = new AvscParser().parse(schemaJson);
+    Assert.assertNull(parsed.getParseError());
+    AvroEnumSchema hugeEnum = (AvroEnumSchema) parsed.getTopLevelSchema();
+    Assert.assertNotNull(hugeEnum);
+
+    //generate java source for the enum and make sure it compiles
+    SpecificRecordClassGenerator codeGen = new SpecificRecordClassGenerator();
+    JavaFileObject generated =
+        codeGen.generateSpecificRecordClass(hugeEnum, SpecificRecordGenerationConfig.BROAD_COMPATIBILITY);
+    CompilerHelper.assertCompiles(generated);
+  }
 }
diff --git a/avro-codegen/src/test/resources/schemas/SimpleEnumWithHugeDoc.avsc b/avro-codegen/src/test/resources/schemas/SimpleEnumWithHugeDoc.avsc
diff --git a/...helper-common/src/main/java/com/linkedin/avroutil1/compatibility/CodeTransformations.java b/...helper-common/src/main/java/com/linkedin/avroutil1/compatibility/CodeTransformations.java
@@ -400,7 +400,7 @@ public static String transformParseCalls(
 
     String argToParseCall;
     if (largeString && !alreadyVararg) {
-      List<String> pieces = safeSplit(stringLiteral, MAX_STRING_LITERAL_SIZE);
+      List<String> pieces = SourceCodeUtils.safeSplit(stringLiteral, MAX_STRING_LITERAL_SIZE);
       StringBuilder argBuilder = new StringBuilder(stringLiteral.length()); //at least
       argBuilder.append("new StringBuilder()");
       for (String piece : pieces) {
@@ -984,53 +984,4 @@ private static String addImports(String code, Collection<String> importStatement
     String newImports = joiner.toString();
     return code.substring(0, endOfImports) + "\n" + newImports + "\n" + code.substring(endOfImports);
   }
-
-  /**
-   * splits a large java string literal into smaller pieces in a safe way.
-   * by safe we mean avoids splitting anywhere near an escape sequence
-   * @param javaStringLiteral large string literal
-   * @return smaller string literals that can be joined to reform the argument
-   */
-  static List<String> safeSplit(String javaStringLiteral, int maxChunkSize) {
-    String remainder = javaStringLiteral;
-    List<String> results = new ArrayList<>(remainder.length() / maxChunkSize);
-    while (remainder.length() > maxChunkSize) {
-      int cutIndex = maxChunkSize;
-      while (cutIndex > 0 && escapesNear(remainder, cutIndex)) {
-        cutIndex--;
-      }
-      if (cutIndex <= 0) {
-        //should never happen ...
-        throw new IllegalStateException("unable to split " + javaStringLiteral);
-      }
-      String piece = remainder.substring(0, cutIndex);
-      results.add(piece);
-      remainder = remainder.substring(cutIndex);
-    }
-    if (!remainder.isEmpty()) {
-      results.add(remainder);
-    }
-    return results;
-  }
-
-  /**
-   * returns true is there's a string escape sequence starting anywhere
-   * near a given index in a given string literal. since the longest escape
-   * sequences in java are ~5-6 characters (unicode escapes) a safety margin
-   * of 10 characters is used.
-   * @param literal string literal to look for escape sequences in
-   * @param index index around (before) which to look for escapes
-   * @return true if any escape sequence found
-   */
-  static boolean escapesNear(String literal, int index) {
-    //we start at index because we dont want the char at the start of the next fragment
-    //to be an "interesting" character either
-    for (int i = index; i > Math.max(0, index - 6); i--) {
-      char c = literal.charAt(i);
-      if (c == '\\' || c == '"' || c == '\'') {
-        return true;
-      }
-    }
-    return false;
-  }
 }
diff --git a/helper/helper-common/src/main/java/com/linkedin/avroutil1/compatibility/SourceCodeUtils.java b/helper/helper-common/src/main/java/com/linkedin/avroutil1/compatibility/SourceCodeUtils.java
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2022 LinkedIn Corp.
+ * Licensed under the BSD 2-Clause License (the "License").
+ * See License in the project root for license information.
+ */
+
+package com.linkedin.avroutil1.compatibility;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * utility methods for manipulating pieces of java source code.
+ */
+public class SourceCodeUtils {
+
+    private SourceCodeUtils() {
+        //util class
+    }
+
+    /**
+     * splits a large java string literal into smaller pieces in a safe way.
+     * by safe we mean a cut is never made near an escape sequence or quote character
+     * (see {@link #escapesNear(String, int)}) and never between the two halves of a
+     * surrogate pair.
+     * chunk sizes are currently measured in characters.
+     * TODO - change this method to calculate chunk sizes in utf-8 bytes
+     * @param javaStringLiteral large string literal
+     * @param maxChunkSize max chunk size in characters. must be positive
+     * @return smaller string literals that can be joined to reform the argument
+     *         (empty if the argument is an empty string)
+     * @throws IllegalArgumentException if maxChunkSize is not positive
+     * @throws IllegalStateException if no safe place to cut could be found
+     */
+    public static List<String> safeSplit(String javaStringLiteral, int maxChunkSize) {
+        if (maxChunkSize <= 0) {
+            //would otherwise fail below with a division by zero / negative list capacity
+            throw new IllegalArgumentException("maxChunkSize must be positive, got " + maxChunkSize);
+        }
+        String remainder = javaStringLiteral;
+        List<String> results = new ArrayList<>(remainder.length() / maxChunkSize + 1);
+        while (remainder.length() > maxChunkSize) {
+            //start at the largest allowed chunk and back off until the cut point is safe
+            int cutIndex = maxChunkSize;
+            while (cutIndex > 0 && !isSafeCut(remainder, cutIndex)) {
+                cutIndex--;
+            }
+            if (cutIndex <= 0) {
+                //should never happen ...
+                throw new IllegalStateException("unable to split " + javaStringLiteral);
+            }
+            String piece = remainder.substring(0, cutIndex);
+            results.add(piece);
+            remainder = remainder.substring(cutIndex);
+        }
+        if (!remainder.isEmpty()) {
+            results.add(remainder);
+        }
+        return results;
+    }
+
+    /**
+     * returns true if cutting a literal just before a given index is safe: there are
+     * no escapes near the index and the cut does not separate a high surrogate
+     * from the low surrogate that follows it.
+     * @param literal string literal to be cut
+     * @param index index of the first character of the would-be next piece.
+     *              must be greater than 0 and smaller than the literal's length
+     * @return true if safe to cut at index
+     */
+    private static boolean isSafeCut(String literal, int index) {
+        if (escapesNear(literal, index)) {
+            return false;
+        }
+        boolean splitsSurrogatePair = Character.isHighSurrogate(literal.charAt(index - 1))
+                && Character.isLowSurrogate(literal.charAt(index));
+        return !splitsSurrogatePair;
+    }
+
+    /**
+     * returns true if there's a character that could be part of a string escape
+     * sequence (a backslash, double quote or single quote) anywhere near a given
+     * index in a given string literal. since the longest escape sequences in java
+     * are 6 characters (unicode escapes) the characters examined are the one at
+     * index and up to 5 characters before it. the character at position 0 is
+     * never examined.
+     * @param literal string literal to look for escape sequences in
+     * @param index index around (before) which to look for escapes.
+     *              must be a valid index into literal
+     * @return true if any such character was found
+     */
+    static boolean escapesNear(String literal, int index) {
+        //we start at index because we dont want the char at the start of the next fragment
+        //to be an "interesting" character either
+        for (int i = index; i > Math.max(0, index - 6); i--) {
+            char c = literal.charAt(i);
+            if (c == '\\' || c == '"' || c == '\'') {
+                return true;
+            }
+        }
+        return false;
+    }
+}
diff --git a/...er-common/src/test/java/com/linkedin/avroutil1/compatibility/CodeTransformationsTest.java b/...er-common/src/test/java/com/linkedin/avroutil1/compatibility/CodeTransformationsTest.java
@@ -6,62 +6,12 @@
 
 package com.linkedin.avroutil1.compatibility;
 
-import java.util.Arrays;
-import java.util.Collections;
 import org.testng.Assert;
 import org.testng.annotations.Test;
 
 
 public class CodeTransformationsTest {
 
-  @Test
-  public void testSafeSplit() {
-    Assert.assertEquals(
-        Arrays.asList("1234567890", "abcdefghij"),
-        CodeTransformations.safeSplit("1234567890abcdefghij", 10));
-    Assert.assertEquals(
-        Arrays.asList("1234567890", "abcdefghij", "AB"),
-        CodeTransformations.safeSplit("1234567890abcdefghijAB", 10));
-    Assert.assertEquals(Collections.singletonList("1234567890"),
-        CodeTransformations.safeSplit("1234567890", 10));
-    //dont chop at '
-    Assert.assertEquals(
-        Arrays.asList("12345678", "9'abcdefgh", "ij"),
-        CodeTransformations.safeSplit("123456789'abcdefghij", 10));
-    //unicode escapes not on the boundary
-    Assert.assertEquals(
-        Arrays.asList("xx\\u1234xx", "xxxxxxxxxx"),
-        CodeTransformations.safeSplit("xx\\u1234xxxxxxxxxxxx", 10));
-    Assert.assertEquals(
-        Arrays.asList("xxxx\\u1234", "xxxxxxxxxx"),
-        CodeTransformations.safeSplit("xxxx\\u1234xxxxxxxxxx", 10));
-    //unicode escapes cross the boundary
-    Assert.assertEquals(
-        Arrays.asList("xxxx","x\\u1234xxx", "xxxxxx"),
-        CodeTransformations.safeSplit("xxxxx\\u1234xxxxxxxxx", 10));
-    Assert.assertEquals(
-        Arrays.asList("xxxxx","x\\u1234xxx", "xxxxx"),
-        CodeTransformations.safeSplit("xxxxxx\\u1234xxxxxxxx", 10));
-    Assert.assertEquals(
-        Arrays.asList("xxxxxx","x\\u1234xxx", "xxxx"),
-        CodeTransformations.safeSplit("xxxxxxx\\u1234xxxxxxx", 10));
-    Assert.assertEquals(
-        Arrays.asList("xxxxxxx","x\\u1234xxx", "xxx"),
-        CodeTransformations.safeSplit("xxxxxxxx\\u1234xxxxxx", 10));
-    Assert.assertEquals(
-        Arrays.asList("xxxxxxxx","x\\u1234xxx", "xx"),
-        CodeTransformations.safeSplit("xxxxxxxxx\\u1234xxxxx", 10));
-    Assert.assertEquals(
-        Arrays.asList("xxxxxxxxx","x\\u1234xxx", "x"),
-        CodeTransformations.safeSplit("xxxxxxxxxx\\u1234xxxx", 10));
-    Assert.assertEquals(
-        Arrays.asList("xxxxxxxxx","x\\u1234xxx", "x"),
-        CodeTransformations.safeSplit("xxxxxxxxxx\\u1234xxxx", 10));
-    Assert.assertEquals(
-        Arrays.asList("xxxxxxxxxx","x\\u1234xxx"),
-        CodeTransformations.safeSplit("xxxxxxxxxxx\\u1234xxx", 10));
-  }
-
   @Test
   public void testFindEndOfSchemaDeclaration() {
     String normal = "public static final org.apache.avro.Schema SCHEMA$ = whatever(\"{json}\"); fluff";

diff --git a/...helper-common/src/test/java/com/linkedin/avroutil1/compatibility/SourceCodeUtilsTest.java b/...helper-common/src/test/java/com/linkedin/avroutil1/compatibility/SourceCodeUtilsTest.java
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2022 LinkedIn Corp.
+ * Licensed under the BSD 2-Clause License (the "License").
+ * See License in the project root for license information.
+ */
+
+package com.linkedin.avroutil1.compatibility;
+
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+import java.util.Arrays;
+import java.util.Collections;
+
+/**
+ * tests for {@link SourceCodeUtils}
+ */
+public class SourceCodeUtilsTest {
+
+    @Test
+    public void testSafeSplit() {
+        //nothing to split
+        Assert.assertEquals(SourceCodeUtils.safeSplit("", 10), Collections.emptyList());
+        assertSplit("abc", 10, "abc");
+        assertSplit("1234567890", 10, "1234567890");
+        //no escapes anywhere, cuts land exactly on chunk boundaries
+        assertSplit("1234567890abcdefghij", 10, "1234567890", "abcdefghij");
+        assertSplit("1234567890abcdefghijAB", 10, "1234567890", "abcdefghij", "AB");
+        //dont chop at ' or "
+        assertSplit("123456789'abcdefghij", 10, "12345678", "9'abcdefgh", "ij");
+        assertSplit("123456789\"abcdefghij", 10, "12345678", "9\"abcdefgh", "ij");
+        //unicode escapes not on the boundary
+        assertSplit("xx\\u1234xxxxxxxxxxxx", 10, "xx\\u1234xx", "xxxxxxxxxx");
+        assertSplit("xxxx\\u1234xxxxxxxxxx", 10, "xxxx\\u1234", "xxxxxxxxxx");
+        //unicode escapes cross the boundary
+        assertSplit("xxxxx\\u1234xxxxxxxxx", 10, "xxxx", "x\\u1234xxx", "xxxxxx");
+        assertSplit("xxxxxx\\u1234xxxxxxxx", 10, "xxxxx", "x\\u1234xxx", "xxxxx");
+        assertSplit("xxxxxxx\\u1234xxxxxxx", 10, "xxxxxx", "x\\u1234xxx", "xxxx");
+        assertSplit("xxxxxxxx\\u1234xxxxxx", 10, "xxxxxxx", "x\\u1234xxx", "xxx");
+        assertSplit("xxxxxxxxx\\u1234xxxxx", 10, "xxxxxxxx", "x\\u1234xxx", "xx");
+        assertSplit("xxxxxxxxxx\\u1234xxxx", 10, "xxxxxxxxx", "x\\u1234xxx", "x");
+        assertSplit("xxxxxxxxxxx\\u1234xxx", 10, "xxxxxxxxxx", "x\\u1234xxx");
+    }
+
+    /**
+     * asserts that splitting a literal yields exactly the expected pieces, in order.
+     * @param literal string to split
+     * @param maxChunkSize max chunk size to split by
+     * @param expectedPieces expected output of the split
+     */
+    private static void assertSplit(String literal, int maxChunkSize, String... expectedPieces) {
+        //testng argument order is (actual, expected)
+        Assert.assertEquals(
+                SourceCodeUtils.safeSplit(literal, maxChunkSize),
+                Arrays.asList(expectedPieces));
+    }
+}
diff --git a/helper/tests/helper-tests-common/build.gradle b/helper/tests/helper-tests-common/build.gradle
@@ -14,7 +14,7 @@ dependencies {
   api project(":helper:helper")
   //we use this module as an easy way to "export" libraries for use by other test modules
   api "commons-io:commons-io:2.6"
-  api "net.openhft:compiler:2.3.6"
+  api "net.openhft:compiler:2.4.1"
 
   implementation "args4j:args4j:2.33"
 

diff --git a/test-common/build.gradle b/test-common/build.gradle
@@ -12,6 +12,6 @@ plugins {
 dependencies {
   //we use this module as an easy way to "export" libraries for use by other test modules
   api "commons-io:commons-io:2.6"
-  api "net.openhft:compiler:2.3.6"
+  api "net.openhft:compiler:2.4.1"
   api "org.testng:testng:6.14.3"
 }