Skip to content

Commit

Permalink
work on new code-gen - generate SCHEMA$ for enum classes (#310)
Browse files Browse the repository at this point in the history
  • Loading branch information
radai-rosenblatt authored Apr 13, 2022
1 parent a21320d commit 2c4f0da
Show file tree
Hide file tree
Showing 10 changed files with 213 additions and 110 deletions.
2 changes: 1 addition & 1 deletion avro-codegen/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ dependencies {
exclude group: "com.thoughtworks.paranamer", module: "paranamer-ant"
exclude group: "org.slf4j"
}
testImplementation 'net.openhft:compiler:2.3.6'
testImplementation 'net.openhft:compiler:2.4.1'
}

jar {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,22 @@

package com.linkedin.avroutil1.codegen;

import com.linkedin.avroutil1.compatibility.HelperConsts;
import com.linkedin.avroutil1.compatibility.SourceCodeUtils;
import com.linkedin.avroutil1.model.AvroEnumSchema;
import com.linkedin.avroutil1.model.AvroNamedSchema;
import com.linkedin.avroutil1.model.AvroType;
import com.linkedin.avroutil1.writer.avsc.AvscSchemaWriter;
import com.squareup.javapoet.ClassName;
import com.squareup.javapoet.CodeBlock;
import com.squareup.javapoet.FieldSpec;
import com.squareup.javapoet.JavaFile;
import com.squareup.javapoet.TypeSpec;
import javax.lang.model.element.Modifier;
import javax.tools.JavaFileObject;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.StringJoiner;


/**
Expand Down Expand Up @@ -56,13 +63,7 @@ protected JavaFileObject generateSpecificEnum(AvroEnumSchema enumSchema, Specifi
}

//add public final static SCHEMA$
ClassName avroSchemaType = ClassName.get("org.apache.avro", "Schema");
classBuilder.alwaysQualify(avroSchemaType.simpleName()); //no import statements
classBuilder.addField(FieldSpec
.builder(avroSchemaType, "SCHEMA$", Modifier.PUBLIC, Modifier.FINAL, Modifier.STATIC)
.initializer("null") //TODO - provide avsc string here
.build()
);
addSchema$ToGeneratedClass(classBuilder, enumSchema);

//create file object
TypeSpec classSpec = classBuilder.build();
Expand All @@ -73,4 +74,52 @@ protected JavaFileObject generateSpecificEnum(AvroEnumSchema enumSchema, Specifi

return javaFile.toJavaFileObject();
}

/**
 * Adds a "public final static Schema SCHEMA$" field to a generated class for a named avro type.
 * The field is initialized with a call to the compatibility helper's parse method
 * (HelperConsts.HELPER_FQCN + ".parse(...)"), which receives the self-contained (fully-inlined)
 * avsc of the schema as one or more string literals:
 * <pre>
 *   public final static Schema SCHEMA$ = [helper].parse(avsc1, avsc2, avsc3 ...)
 * </pre>
 * A string constant in a class file cannot exceed 65535 bytes, so a large avsc is chunked and
 * passed through the var-args form of parse().
 *
 * @param classBuilder builder for the class being generated
 * @param classSchema schema of the class being generated
 */
protected void addSchema$ToGeneratedClass(TypeSpec.Builder classBuilder, AvroNamedSchema classSchema) {
  ClassName avroSchemaType = ClassName.get("org.apache.avro", "Schema");
  classBuilder.alwaysQualify(avroSchemaType.simpleName()); //no import statements

  //get fully-inlined single-line avsc from schema
  AvscSchemaWriter avscWriter = new AvscSchemaWriter();
  String avsc = avscWriter.writeSingle(classSchema).getContents();

  //the JVM spec says string constants cant be over 65535 bytes in size. the size is measured in
  //the class file's "modified UTF-8" encoding, which is NOT the same as standard UTF-8:
  //supplementary characters take 6 bytes (not 4) and the NUL character takes 2 bytes (not 1).
  //for details see https://docs.oracle.com/javase/specs/jvms/se8/html/jvms-4.html#jvms-4.4.7
  //we add some extra safety margin
  String parseFormat;
  Object[] parseFormatArgs;
  if (modifiedUtf8Length(avsc) > 64000) {
    //chunk size is in characters. every java char takes at most 3 bytes in modified UTF-8,
    //so a 20000 character chunk is at most 60000 bytes - always under the limit
    List<String> chunks = SourceCodeUtils.safeSplit(avsc, 20000);
    StringJoiner csv = new StringJoiner(", ");
    for (int i = 1; i <= chunks.size(); i++) {
      //"$1S, $2S, ... $NS"
      csv.add("$" + i + "S");
    }
    parseFormat = HelperConsts.HELPER_FQCN + ".parse(" + csv + ")";
    parseFormatArgs = chunks.toArray(new Object[0]);
  } else {
    //no need to split anything
    parseFormat = HelperConsts.HELPER_FQCN + ".parse($1S)";
    parseFormatArgs = new Object[] {avsc};
  }
  classBuilder.addField(FieldSpec
      .builder(avroSchemaType, "SCHEMA$", Modifier.PUBLIC, Modifier.FINAL, Modifier.STATIC)
      //TODO - use strict parsing
      .initializer(CodeBlock.of(parseFormat, parseFormatArgs))
      .build()
  );
}

/**
 * Computes the number of bytes a string occupies when stored as a class file string constant
 * (the "modified UTF-8" encoding described in JVM spec section 4.4.7).
 *
 * @param str string to measure
 * @return encoded size in bytes
 */
private static long modifiedUtf8Length(String str) {
  long bytes = 0;
  for (int i = 0; i < str.length(); i++) {
    char c = str.charAt(i);
    if (c >= 0x0001 && c <= 0x007F) {
      bytes += 1;
    } else if (c <= 0x07FF) {
      //also covers NUL (0x0000), which is stored as 2 bytes
      bytes += 2;
    } else {
      //includes each half of a surrogate pair, which are encoded separately
      bytes += 3;
    }
  }
  return bytes;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,18 @@ public void testSimpleEnum() throws Exception {

CompilerHelper.assertCompiles(javaSourceFile);
}

/**
 * Generates a java class for the enum schema stored in schemas/SimpleEnumWithHugeDoc.avsc
 * (judging by the resource name, an enum carrying a very large doc string) and asserts
 * that the generated source compiles.
 */
@Test
public void testHugeEnum() throws Exception {
  String hugeAvsc = TestUtil.load("schemas/SimpleEnumWithHugeDoc.avsc");
  SpecificRecordClassGenerator classGenerator = new SpecificRecordClassGenerator();

  //parse the avsc and make sure parsing succeeded
  AvscParseResult parsed = new AvscParser().parse(hugeAvsc);
  Assert.assertNull(parsed.getParseError());
  AvroEnumSchema schema = (AvroEnumSchema) parsed.getTopLevelSchema();
  Assert.assertNotNull(schema);

  //generate code and compile it
  JavaFileObject generatedSource =
      classGenerator.generateSpecificRecordClass(schema, SpecificRecordGenerationConfig.BROAD_COMPATIBILITY);
  CompilerHelper.assertCompiles(generatedSource);
}
}

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,7 @@ public static String transformParseCalls(

String argToParseCall;
if (largeString && !alreadyVararg) {
List<String> pieces = safeSplit(stringLiteral, MAX_STRING_LITERAL_SIZE);
List<String> pieces = SourceCodeUtils.safeSplit(stringLiteral, MAX_STRING_LITERAL_SIZE);
StringBuilder argBuilder = new StringBuilder(stringLiteral.length()); //at least
argBuilder.append("new StringBuilder()");
for (String piece : pieces) {
Expand Down Expand Up @@ -984,53 +984,4 @@ private static String addImports(String code, Collection<String> importStatement
String newImports = joiner.toString();
return code.substring(0, endOfImports) + "\n" + newImports + "\n" + code.substring(endOfImports);
}

/**
 * Breaks a long java string literal into consecutive pieces of at most maxChunkSize
 * characters each. A piece boundary is never placed where escapesNear() reports an
 * escape sequence, so escapes are never torn apart.
 * @param javaStringLiteral the literal to break up
 * @param maxChunkSize upper bound on the length (in characters) of every piece
 * @return the pieces, in order; concatenating them yields the original literal
 */
static List<String> safeSplit(String javaStringLiteral, int maxChunkSize) {
  List<String> pieces = new ArrayList<>(javaStringLiteral.length() / maxChunkSize);
  String rest = javaStringLiteral;
  while (rest.length() > maxChunkSize) {
    //walk the cut point backwards until it is clear of any escape sequence
    int cut;
    for (cut = maxChunkSize; cut > 0; cut--) {
      if (!escapesNear(rest, cut)) {
        break;
      }
    }
    if (cut <= 0) {
      //should never happen ...
      throw new IllegalStateException("unable to split " + javaStringLiteral);
    }
    pieces.add(rest.substring(0, cut));
    rest = rest.substring(cut);
  }
  //whatever is left fits in a single piece
  if (!rest.isEmpty()) {
    pieces.add(rest);
  }
  return pieces;
}

/**
 * Tells whether a backslash, double quote or single quote appears at the given index
 * or in the 5 characters before it (a 6 character window, matching the length of a
 * java unicode escape). Position 0 of the literal is never examined.
 * @param literal string literal to inspect
 * @param index position at (and before) which to look
 * @return true if any of those characters is found in the window
 */
static boolean escapesNear(String literal, int index) {
  //the char AT index is examined too, as it would become the first char of the next fragment
  int floor = Math.max(0, index - 6);
  int pos = index;
  while (pos > floor) {
    switch (literal.charAt(pos)) {
      case '\\':
      case '"':
      case '\'':
        return true;
      default:
        pos--;
    }
  }
  return false;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/*
* Copyright 2022 LinkedIn Corp.
* Licensed under the BSD 2-Clause License (the "License").
* See License in the project root for license information.
*/

package com.linkedin.avroutil1.compatibility;

import java.util.ArrayList;
import java.util.List;

/**
 * Static helpers for manipulating pieces of java source code.
 */
public class SourceCodeUtils {

  /**
   * how many characters (counting back from, and including, a candidate cut point) are
   * checked for the start of an escape sequence. the longest java escape sequence - a
   * unicode escape made of a backslash, the letter u and 4 hex digits - is 6 characters.
   */
  private static final int ESCAPE_LOOKBACK = 6;

  private SourceCodeUtils() {
    //util class
  }

  /**
   * splits a large java string literal into smaller pieces in a safe way.
   * by safe we mean it avoids splitting anywhere near an escape sequence, and never
   * splits between the two halves of a surrogate pair (which would leave unpaired
   * surrogates in adjacent pieces).
   * @param javaStringLiteral large string literal
   * @param maxChunkSize max chunk size in characters. must be positive
   * @return smaller string literals that can be joined to reform the argument
   * @throws IllegalArgumentException if maxChunkSize is not positive
   * @throws IllegalStateException if no safe place to split could be found
   * TODO - change this method to calculate chunk sizes in utf-8 bytes
   */
  public static List<String> safeSplit(String javaStringLiteral, int maxChunkSize) {
    if (maxChunkSize <= 0) {
      throw new IllegalArgumentException("maxChunkSize must be positive, got " + maxChunkSize);
    }
    String remainder = javaStringLiteral;
    List<String> results = new ArrayList<>(remainder.length() / maxChunkSize);
    while (remainder.length() > maxChunkSize) {
      int cutIndex = maxChunkSize;
      //back off until the cut is clear of escapes and does not fall inside a surrogate pair
      while (cutIndex > 0
          && (escapesNear(remainder, cutIndex) || splitsSurrogatePair(remainder, cutIndex))) {
        cutIndex--;
      }
      if (cutIndex <= 0) {
        //should never happen ...
        throw new IllegalStateException("unable to split " + javaStringLiteral);
      }
      String piece = remainder.substring(0, cutIndex);
      results.add(piece);
      remainder = remainder.substring(cutIndex);
    }
    if (!remainder.isEmpty()) {
      results.add(remainder);
    }
    return results;
  }

  /**
   * returns true if there's a backslash, double quote or single quote at the given
   * index or within the few characters before it. since the longest escape sequences
   * in java are 6 characters (unicode escapes), a window of 6 characters is checked:
   * the character at index and the 5 characters preceding it. the very first character
   * of the literal (position 0) is never examined.
   * @param literal string literal to look for escape sequences in
   * @param index index around (before) which to look for escapes. must be a valid
   *              index into literal
   * @return true if any such character is found
   */
  static boolean escapesNear(String literal, int index) {
    //we start at index because we dont want the char at the start of the next fragment
    //to be an "interesting" character either
    for (int i = index; i > Math.max(0, index - ESCAPE_LOOKBACK); i--) {
      char c = literal.charAt(i);
      if (c == '\\' || c == '"' || c == '\'') {
        return true;
      }
    }
    return false;
  }

  /**
   * returns true if cutting the given text right before the given index would separate
   * a high surrogate from the low surrogate that follows it.
   * @param text text being split
   * @param index candidate cut point; the character at index would start the next piece
   * @return true if the cut falls in the middle of a surrogate pair
   */
  private static boolean splitsSurrogatePair(String text, int index) {
    return index > 0
        && index < text.length()
        && Character.isLowSurrogate(text.charAt(index))
        && Character.isHighSurrogate(text.charAt(index - 1));
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,62 +6,12 @@

package com.linkedin.avroutil1.compatibility;

import java.util.Arrays;
import java.util.Collections;
import org.testng.Assert;
import org.testng.annotations.Test;


public class CodeTransformationsTest {

/**
 * Exercises CodeTransformations.safeSplit() with a chunk size of 10 on plain text,
 * on text containing a single quote, and on text containing a unicode escape located
 * at various offsets relative to the chunk boundary.
 */
@Test
public void testSafeSplit() {
  assertSafeSplit("1234567890abcdefghij", "1234567890", "abcdefghij");
  assertSafeSplit("1234567890abcdefghijAB", "1234567890", "abcdefghij", "AB");
  Assert.assertEquals(Collections.singletonList("1234567890"),
      CodeTransformations.safeSplit("1234567890", 10));
  //dont chop at '
  assertSafeSplit("123456789'abcdefghij", "12345678", "9'abcdefgh", "ij");
  //unicode escapes not on the boundary
  assertSafeSplit("xx\\u1234xxxxxxxxxxxx", "xx\\u1234xx", "xxxxxxxxxx");
  assertSafeSplit("xxxx\\u1234xxxxxxxxxx", "xxxx\\u1234", "xxxxxxxxxx");
  //unicode escapes cross the boundary
  assertSafeSplit("xxxxx\\u1234xxxxxxxxx", "xxxx", "x\\u1234xxx", "xxxxxx");
  assertSafeSplit("xxxxxx\\u1234xxxxxxxx", "xxxxx", "x\\u1234xxx", "xxxxx");
  assertSafeSplit("xxxxxxx\\u1234xxxxxxx", "xxxxxx", "x\\u1234xxx", "xxxx");
  assertSafeSplit("xxxxxxxx\\u1234xxxxxx", "xxxxxxx", "x\\u1234xxx", "xxx");
  assertSafeSplit("xxxxxxxxx\\u1234xxxxx", "xxxxxxxx", "x\\u1234xxx", "xx");
  assertSafeSplit("xxxxxxxxxx\\u1234xxxx", "xxxxxxxxx", "x\\u1234xxx", "x");
  assertSafeSplit("xxxxxxxxxx\\u1234xxxx", "xxxxxxxxx", "x\\u1234xxx", "x");
  assertSafeSplit("xxxxxxxxxxx\\u1234xxx", "xxxxxxxxxx", "x\\u1234xxx");
}

/**
 * Asserts that splitting the input with a max chunk size of 10 yields exactly the given pieces.
 */
private static void assertSafeSplit(String input, String... expectedPieces) {
  Assert.assertEquals(Arrays.asList(expectedPieces), CodeTransformations.safeSplit(input, 10));
}

@Test
public void testFindEndOfSchemaDeclaration() {
String normal = "public static final org.apache.avro.Schema SCHEMA$ = whatever(\"{json}\"); fluff";
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
* Copyright 2022 LinkedIn Corp.
* Licensed under the BSD 2-Clause License (the "License").
* See License in the project root for license information.
*/

package com.linkedin.avroutil1.compatibility;

import org.testng.Assert;
import org.testng.annotations.Test;

import java.util.Arrays;
import java.util.Collections;

/**
 * Tests for {@link SourceCodeUtils}.
 */
public class SourceCodeUtilsTest {

  /**
   * Exercises safeSplit() with a chunk size of 10 on plain text, on text containing a
   * single quote, and on text containing a unicode escape located at various offsets
   * relative to the chunk boundary.
   */
  @Test
  public void testSafeSplit() {
    assertSafeSplit("1234567890abcdefghij", "1234567890", "abcdefghij");
    assertSafeSplit("1234567890abcdefghijAB", "1234567890", "abcdefghij", "AB");
    Assert.assertEquals(Collections.singletonList("1234567890"),
        SourceCodeUtils.safeSplit("1234567890", 10));
    //dont chop at '
    assertSafeSplit("123456789'abcdefghij", "12345678", "9'abcdefgh", "ij");
    //unicode escapes not on the boundary
    assertSafeSplit("xx\\u1234xxxxxxxxxxxx", "xx\\u1234xx", "xxxxxxxxxx");
    assertSafeSplit("xxxx\\u1234xxxxxxxxxx", "xxxx\\u1234", "xxxxxxxxxx");
    //unicode escapes cross the boundary
    assertSafeSplit("xxxxx\\u1234xxxxxxxxx", "xxxx", "x\\u1234xxx", "xxxxxx");
    assertSafeSplit("xxxxxx\\u1234xxxxxxxx", "xxxxx", "x\\u1234xxx", "xxxxx");
    assertSafeSplit("xxxxxxx\\u1234xxxxxxx", "xxxxxx", "x\\u1234xxx", "xxxx");
    assertSafeSplit("xxxxxxxx\\u1234xxxxxx", "xxxxxxx", "x\\u1234xxx", "xxx");
    assertSafeSplit("xxxxxxxxx\\u1234xxxxx", "xxxxxxxx", "x\\u1234xxx", "xx");
    assertSafeSplit("xxxxxxxxxx\\u1234xxxx", "xxxxxxxxx", "x\\u1234xxx", "x");
    assertSafeSplit("xxxxxxxxxx\\u1234xxxx", "xxxxxxxxx", "x\\u1234xxx", "x");
    assertSafeSplit("xxxxxxxxxxx\\u1234xxx", "xxxxxxxxxx", "x\\u1234xxx");
  }

  /**
   * Asserts that splitting the input with a max chunk size of 10 yields exactly the given pieces.
   */
  private static void assertSafeSplit(String input, String... expectedPieces) {
    Assert.assertEquals(Arrays.asList(expectedPieces), SourceCodeUtils.safeSplit(input, 10));
  }
}
2 changes: 1 addition & 1 deletion helper/tests/helper-tests-common/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ dependencies {
api project(":helper:helper")
//we use this module as an easy way to "export" libraries for use by other test modules
api "commons-io:commons-io:2.6"
api "net.openhft:compiler:2.3.6"
api "net.openhft:compiler:2.4.1"

implementation "args4j:args4j:2.33"

Expand Down
2 changes: 1 addition & 1 deletion test-common/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,6 @@ plugins {
dependencies {
//we use this module as an easy way to "export" libraries for use by other test modules
api "commons-io:commons-io:2.6"
api "net.openhft:compiler:2.3.6"
api "net.openhft:compiler:2.4.1"
api "org.testng:testng:6.14.3"
}

0 comments on commit 2c4f0da

Please sign in to comment.