diff --git a/jchunk-core/src/main/java/jchunk/chunker/core/chunk/Chunk.java b/jchunk-core/src/main/java/jchunk/chunker/core/chunk/Chunk.java index cfdfc41..3691e1b 100644 --- a/jchunk-core/src/main/java/jchunk/chunker/core/chunk/Chunk.java +++ b/jchunk-core/src/main/java/jchunk/chunker/core/chunk/Chunk.java @@ -10,4 +10,30 @@ * @author Pablo Sanchidrian Herrera */ public record Chunk(Integer id, String content) { + + public static Builder builder() { + return new Builder(); + } + + public static class Builder { + + private Integer id; + + private String content; + + public Builder id(Integer id) { + this.id = id; + return this; + } + + public Builder content(String content) { + this.content = content; + return this; + } + + public Chunk build() { + return new Chunk(id, content); + } + + } } diff --git a/jchunk-recursive-character/pom.xml b/jchunk-recursive-character/pom.xml new file mode 100644 index 0000000..9d7b4ed --- /dev/null +++ b/jchunk-recursive-character/pom.xml @@ -0,0 +1,43 @@ + + + 4.0.0 + + com.github.PabloSanchi + jchunk + 0.0.1-SNAPSHOT + + + jchunk-recursive-character + jar + JChunk - Recursive Character Chunker + Recursive Character Chunker for Java + https://github.com/PabloSanchi/jchunk + + + https://github.com/PabloSanchi/jchunk + git://github.com/PabloSanchi/jchunk.git + git@github.com:PabloSanchi/jchunk.git + + + + + com.github.PabloSanchi + jchunk-core + ${project.parent.version} + + + + org.springframework.boot + spring-boot + + + + + org.springframework.boot + spring-boot-starter-test + + + + \ No newline at end of file diff --git a/jchunk-recursive-character/src/main/java/jchunk/chunker/recursive/Config.java b/jchunk-recursive-character/src/main/java/jchunk/chunker/recursive/Config.java new file mode 100644 index 0000000..30d390c --- /dev/null +++ b/jchunk-recursive-character/src/main/java/jchunk/chunker/recursive/Config.java @@ -0,0 +1,127 @@ +package jchunk.chunker.recursive; + +import org.springframework.util.Assert; + +import 
java.util.ArrayList; +import java.util.List; + +/** + * {@link Config} is a class that holds the configuration for the + * {@link RecursiveCharacterChunker}. + * + * @author Pablo Sanchidrian Herrera + */ +public class Config { + + private final Integer chunkSize; + + private final Integer chunkOverlap; + + private final List delimiters; + + private final Delimiter keepDelimiter; + + private final Boolean trimWhitespace; + + public Integer getChunkSize() { + return chunkSize; + } + + public Integer getChunkOverlap() { + return chunkOverlap; + } + + public List getDelimiters() { + return delimiters; + } + + public Delimiter getKeepDelimiter() { + return keepDelimiter; + } + + public Boolean getTrimWhitespace() { + return trimWhitespace; + } + + private Config(Integer chunkSize, Integer chunkOverlap, List delimiters, Delimiter keepDelimiter, + Boolean trimWhitespace) { + this.chunkSize = chunkSize; + this.chunkOverlap = chunkOverlap; + this.delimiters = delimiters; + this.keepDelimiter = keepDelimiter; + this.trimWhitespace = trimWhitespace; + } + + /** + * {@return the default config} + */ + public static Config defaultConfig() { + return builder().build(); + } + + public static Builder builder() { + return new Builder(); + } + + public static class Builder { + + private Integer chunkSize = 100; + + private Integer chunkOverlap = 20; + + private List delimiters = new ArrayList<>(List.of("\n\n", "\n", " ", "")); + + private Delimiter keepDelimiter = Delimiter.START; + + private Boolean trimWhitespace = true; + + public Builder chunkSize(Integer chunkSize) { + Assert.isTrue(chunkSize > 0, "Chunk size must be greater than 0"); + this.chunkSize = chunkSize; + return this; + } + + public Builder chunkOverlap(Integer chunkOverlap) { + Assert.isTrue(chunkOverlap >= 0, "Chunk overlap must be greater than or equal to 0"); + this.chunkOverlap = chunkOverlap; + return this; + } + + public Builder separators(List delimiters) { + this.delimiters = delimiters; + return this; + } 
+ + public Builder keepDelimiter(Delimiter keepDelimiter) { + this.keepDelimiter = keepDelimiter; + return this; + } + + public Builder trimWhitespace(Boolean trimWhitespace) { + this.trimWhitespace = trimWhitespace; + return this; + } + + public Config build() { + Assert.isTrue(chunkSize > chunkOverlap, "Chunk size must be greater than chunk overlap"); + return new Config(chunkSize, chunkOverlap, delimiters, keepDelimiter, trimWhitespace); + } + + } + + /** + * Enum to represent the delimiter configuration + *

+ *

    + *
  • NONE: No delimiter
  • + *
  • START: Delimiter at the start of the chunk
  • + *
  • END: Delimiter at the end of the chunk
  • + *
+ */ + public enum Delimiter { + + NONE, START, END + + } + +} diff --git a/jchunk-recursive-character/src/main/java/jchunk/chunker/recursive/RecursiveCharacterChunker.java b/jchunk-recursive-character/src/main/java/jchunk/chunker/recursive/RecursiveCharacterChunker.java new file mode 100644 index 0000000..4acdd8d --- /dev/null +++ b/jchunk-recursive-character/src/main/java/jchunk/chunker/recursive/RecursiveCharacterChunker.java @@ -0,0 +1,33 @@ +package jchunk.chunker.recursive; + +import jchunk.chunker.core.chunk.Chunk; +import jchunk.chunker.core.chunk.IChunker; + +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * {@link RecursiveCharacterChunker} is a class that implements the {@link IChunker} + * interface and splits a text into chunks recursively with the given separators. + * + * @author Pablo Sanchidrian Herrera + */ +public class RecursiveCharacterChunker implements IChunker { + + private final Config config; + + public RecursiveCharacterChunker() { + this(Config.defaultConfig()); + } + + public RecursiveCharacterChunker(Config config) { + this.config = config; + } + + @Override + public List split(String content) { + return Utils.splitContent(content, config.getChunkSize(), config.getChunkOverlap(), config.getKeepDelimiter(), + config.getDelimiters(), config.getTrimWhitespace(), new AtomicInteger(0)); + } + +} diff --git a/jchunk-recursive-character/src/main/java/jchunk/chunker/recursive/Utils.java b/jchunk-recursive-character/src/main/java/jchunk/chunker/recursive/Utils.java new file mode 100644 index 0000000..5521ca7 --- /dev/null +++ b/jchunk-recursive-character/src/main/java/jchunk/chunker/recursive/Utils.java @@ -0,0 +1,256 @@ +package jchunk.chunker.recursive; + +import jchunk.chunker.core.chunk.Chunk; + +import java.util.*; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.logging.Logger; +import java.util.regex.Pattern; +import java.util.stream.IntStream; + +/** + * Utility class for recursive 
chunker. + * + * @author Pablo Sanchidrian Herrera + */ +public class Utils { + + private static final Logger logger = Logger.getLogger(Utils.class.getName()); + + public static final String LONGER_THAN_THE_SPECIFIED = "Created a chunk of size %d, which is longer than the specified %d"; + + /** + * private constructor to hide the implicit public one + */ + private Utils() { + } + + /** + * Splits the content into chunks. + * @param content the content to split + * @param chunkSize the size of the chunk + * @param chunkOverlap the overlap between chunks + * @param keepDelimiter whether to keep the delimiter at the start or end of the + * sentence or not. {@link Config.Delimiter} + * @param delimiters the list of delimiters to split the content + * @param trimWhitespace whether to trim the whitespace + * @param index the index of the chunk + * @return the list of chunks {@link Chunk} + */ + public static List splitContent(String content, Integer chunkSize, Integer chunkOverlap, + Config.Delimiter keepDelimiter, List delimiters, Boolean trimWhitespace, AtomicInteger index) { + + List newDelimiters = new ArrayList<>(delimiters); + String delimiter = getBestMatchingDelimiter(content, newDelimiters); + + List splits = splitWithDelimiter(content, delimiter, keepDelimiter); + + List goodSplits = new ArrayList<>(); + String delimiterToUse = (keepDelimiter != Config.Delimiter.NONE) ? "" : delimiter; + + List chunks = new ArrayList<>(); + + for (String split : splits) { + if (split.length() < chunkSize) { + goodSplits.add(split); + } + else { + if (!goodSplits.isEmpty()) { + List generatedChunks = mergeSentences(goodSplits, delimiterToUse, chunkSize, chunkOverlap, + trimWhitespace, index); + chunks.addAll(generatedChunks); + goodSplits.clear(); + } + + if (newDelimiters.isEmpty()) { + Chunk chunk = Chunk.builder() + .id(index.getAndIncrement()) + .content(trimWhitespace ? 
split.trim() : split) + .build(); + chunks.add(chunk); + } + else { + List generatedChunks = splitContent(split, chunkSize, chunkOverlap, keepDelimiter, + newDelimiters, trimWhitespace, index); + chunks.addAll(generatedChunks); + } + } + } + + if (!goodSplits.isEmpty()) { + List generatedChunks = mergeSentences(goodSplits, delimiterToUse, chunkSize, chunkOverlap, + trimWhitespace, index); + chunks.addAll(generatedChunks); + } + + return chunks; + } + + /** + * Get the best matching delimiter from right to left in the delimiter list from the + * given config + * @param content the content to split + * @param delimiters the list of delimiters to check + * @return the best matching delimiter and modifies the reference value of the given + * list + */ + private static String getBestMatchingDelimiter(String content, List delimiters) { + for (Iterator iterator = delimiters.iterator(); iterator.hasNext();) { + String delimiter = iterator.next(); + + if (delimiter.isEmpty()) { + delimiters.clear(); + return delimiter; + } + + if (Pattern.compile(delimiter).matcher(content).find()) { + iterator.remove(); + return delimiter; + } + } + + return ""; + } + + /** + * Splits the content into sentences using the delimiter. + * @param content the content to split + * @param delimiter the delimiter to split the content. + * @param keepDelimiter whether to keep the delimiter at the start or end of the + * sentence or not. {@link Config.Delimiter} + * @return a list of split sentences + */ + private static List splitWithDelimiter(String content, String delimiter, Config.Delimiter keepDelimiter) { + if (delimiter.isEmpty()) { + return content.chars().mapToObj(c -> String.valueOf((char) c)).toList(); + } + + String withDelimiter = "((?<=%1$s)|(?=%1$s))"; + List preSplits = new ArrayList<>(List.of(content.split(String.format(withDelimiter, delimiter)))); + + return keepDelimiter == Config.Delimiter.START ? 
splitWithDelimiterStart(preSplits) + : splitWithDelimiterEnd(preSplits); + } + + /** + * Splits the content into sentences using the delimiter at the start of each + * sentence. {@link Config.Delimiter#START} + * @param preSplits pre-splits by the delimiter + * @return the list of split sentences + */ + private static List splitWithDelimiterStart(List preSplits) { + List splits = new ArrayList<>(); + + splits.add(preSplits.getFirst()); + IntStream.range(1, preSplits.size() - 1) + .filter(i -> i % 2 == 1) + .forEach(i -> splits.add(preSplits.get(i).concat(preSplits.get(i + 1)))); + + return splits.stream().filter(s -> !s.isBlank()).toList(); + } + + /** + * Splits the content into sentences using the delimiter at the end of each sentence. + * {@link Config.Delimiter#END} + * @param preSplits the pre-splits by the delimiter + * @return the list of split sentences + */ + private static List splitWithDelimiterEnd(List preSplits) { + List splits = new ArrayList<>(); + + IntStream.range(0, preSplits.size() - 1) + .filter(i -> i % 2 == 0) + .forEach(i -> splits.add(preSplits.get(i).concat(preSplits.get(i + 1)))); + splits.add(preSplits.getLast()); + + return splits.stream().filter(s -> !s.isBlank()).toList(); + } + + /** + * Merges the sentences into chunks. + * @param sentences the sentences to merge + * @return list of chunks + */ + static List mergeSentences(List sentences, String delimiter, int chunkSize, int chunkOverlap, + boolean trimWhitespace, AtomicInteger index) { + + int currentLen = 0; + int delimiterLen = delimiter.length(); + + List chunks = new ArrayList<>(); + Deque currentChunk = new LinkedList<>(); + + for (String sentence : sentences) { + int sentenceLength = sentence.length(); + + if (currentLen + sentenceLength + (currentChunk.isEmpty() ? 
0 : delimiterLen) > chunkSize) { + + if (currentLen > chunkSize) { + logger.warning(String.format(LONGER_THAN_THE_SPECIFIED, currentLen, chunkSize)); + } + + if (!currentChunk.isEmpty()) { + addChunk(chunks, currentChunk, delimiter, trimWhitespace, index); + currentLen = adjustCurrentChunkForOverlap(currentChunk, currentLen, chunkOverlap, delimiterLen); + } + } + + currentChunk.addLast(sentence); + currentLen += sentenceLength + (currentChunk.size() > 1 ? delimiterLen : 0); + } + + if (!currentChunk.isEmpty()) { + addChunk(chunks, currentChunk, delimiter, trimWhitespace, index); + } + + return chunks; + } + + /** + * Adds the chunk to the list of chunks. + * @param chunks the list of chunks + * @param currentChunk the current chunk + * @param delimiter the delimiter + * @param trimWhitespace whether to trim the whitespace + * @param index the index of the chunk + */ + private static void addChunk(List chunks, Deque currentChunk, String delimiter, + boolean trimWhitespace, AtomicInteger index) { + String generatedSentence = joinSentences(new ArrayList<>(currentChunk), delimiter, trimWhitespace); + Chunk chunk = Chunk.builder().id(index.getAndIncrement()).content(generatedSentence).build(); + chunks.add(chunk); + } + + /** + * Adjusts the current chunk for overlap. + * @param currentChunk the current chunk + * @param currentLen the current length of the chunk + * @param chunkOverlap the overlap between chunks + * @param delimiterLen the length of the delimiter + * @return the adjusted length of the chunk + */ + private static int adjustCurrentChunkForOverlap(Deque currentChunk, int currentLen, int chunkOverlap, + int delimiterLen) { + while (currentLen > chunkOverlap && !currentChunk.isEmpty()) { + currentLen -= currentChunk.removeFirst().length() + (currentChunk.isEmpty() ? 0 : delimiterLen); + } + return currentLen; + } + + /** + * Joins the sentences into a single sentence. 
+ * @param sentences the sentences to join + * @param delimiter the delimiter to join the sentences + * @param trimWhitespace whether to trim the whitespace + * @return the generated sentence + */ + private static String joinSentences(List sentences, String delimiter, Boolean trimWhitespace) { + String generatedSentence = String.join(delimiter, sentences); + if (trimWhitespace) { + generatedSentence = generatedSentence.trim(); + } + + return generatedSentence; + } + +} diff --git a/jchunk-recursive-character/src/test/java/jchunk/chunker/recursive/ConfigTest.java b/jchunk-recursive-character/src/test/java/jchunk/chunker/recursive/ConfigTest.java new file mode 100644 index 0000000..1e8271a --- /dev/null +++ b/jchunk-recursive-character/src/test/java/jchunk/chunker/recursive/ConfigTest.java @@ -0,0 +1,65 @@ +package jchunk.chunker.recursive; + +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +class ConfigTest { + + @Test + void testDefaultConfig() { + Config config = Config.defaultConfig(); + + assertThat(config.getChunkSize()).isEqualTo(100); + assertThat(config.getChunkOverlap()).isEqualTo(20); + assertThat(config.getDelimiters()).containsExactly("\n\n", "\n", " ", ""); + assertThat(config.getKeepDelimiter()).isEqualTo(Config.Delimiter.START); + assertThat(config.getTrimWhitespace()).isTrue(); + } + + @Test + void testCustomConfig() { + Config config = Config.builder() + .chunkSize(50) + .chunkOverlap(10) + .separators(List.of("-", "!", "?")) + .keepDelimiter(Config.Delimiter.END) + .trimWhitespace(false) + .build(); + + assertThat(config.getChunkSize()).isEqualTo(50); + assertThat(config.getChunkOverlap()).isEqualTo(10); + assertThat(config.getDelimiters()).containsExactly("-", "!", "?"); + assertThat(config.getKeepDelimiter()).isEqualTo(Config.Delimiter.END); + assertThat(config.getTrimWhitespace()).isFalse(); + } + + @Test 
+ void testThrowExceptionWhenChunkSizeIsZero() { + assertThatThrownBy(() -> Config.builder().chunkSize(0).build()).isInstanceOf(IllegalArgumentException.class) + .hasMessage("Chunk size must be greater than 0"); + } + + @Test + void testThrowExceptionWhenChunkSizeIsNegative() { + assertThatThrownBy(() -> Config.builder().chunkSize(-1).build()).isInstanceOf(IllegalArgumentException.class) + .hasMessage("Chunk size must be greater than 0"); + } + + @Test + void testThrowExceptionWhenChunkOverlapIsNegative() { + assertThatThrownBy(() -> Config.builder().chunkOverlap(-1).build()).isInstanceOf(IllegalArgumentException.class) + .hasMessage("Chunk overlap must be greater than or equal to 0"); + } + + @Test + void testThrowExceptionWhenChunkOverlapIsGreaterThanChunkSize() { + assertThatThrownBy(() -> Config.builder().chunkSize(10).chunkOverlap(20).build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Chunk size must be greater than chunk overlap"); + } + +} diff --git a/jchunk-recursive-character/src/test/java/jchunk/chunker/recursive/RecursiveCharacterChunkerTest.java b/jchunk-recursive-character/src/test/java/jchunk/chunker/recursive/RecursiveCharacterChunkerTest.java new file mode 100644 index 0000000..7ce674e --- /dev/null +++ b/jchunk-recursive-character/src/test/java/jchunk/chunker/recursive/RecursiveCharacterChunkerTest.java @@ -0,0 +1,75 @@ +package jchunk.chunker.recursive; + +import jchunk.chunker.core.chunk.Chunk; +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +class RecursiveCharacterChunkerTest { + + static String content = """ + One of the most important things I didn't understand about the world when I was a child is the degree to which the returns for performance are superlinear. + + Teachers and coaches implicitly told us the returns were linear. "You get out," I heard a thousand times, "what you put in." They meant well, but this is rarely true. 
If your product is only half as good as your competitor's, you don't get half as many customers. You get no customers, and you go out of business. + + It's obviously true that the returns for performance are superlinear in business. Some think this is a flaw of capitalism, and that if we changed the rules it would stop being true. But superlinear returns for performance are a feature of the world, not an artifact of rules we've invented. We see the same pattern in fame, power, military victories, knowledge, and even benefit to humanity. In all of these, the rich get richer. [1] + """; + + @Test + void testSplit() { + Config config = Config.builder().chunkSize(65).chunkOverlap(0).build(); + RecursiveCharacterChunker chunker = new RecursiveCharacterChunker(config); + + List expectedChunks = List.of( + new Chunk(0, "One of the most important things I didn't understand about the"), + new Chunk(1, "world when I was a child is the degree to which the returns for"), + new Chunk(2, "performance are superlinear."), + new Chunk(3, "Teachers and coaches implicitly told us the returns were linear."), + new Chunk(4, "\"You get out,\" I heard a thousand times, \"what you put in.\" They"), + new Chunk(5, "meant well, but this is rarely true. If your product is only"), + new Chunk(6, "half as good as your competitor's, you don't get half as many"), + new Chunk(7, "customers. You get no customers, and you go out of business."), + new Chunk(8, "It's obviously true that the returns for performance are"), + new Chunk(9, "superlinear in business. Some think this is a flaw of"), + new Chunk(10, "capitalism, and that if we changed the rules it would stop being"), + new Chunk(11, "true. But superlinear returns for performance are a feature of"), + new Chunk(12, "the world, not an artifact of rules we've invented. We see the"), + new Chunk(13, "same pattern in fame, power, military victories, knowledge, and"), + new Chunk(14, "even benefit to humanity. 
In all of these, the rich get richer."), new Chunk(15, "[1]")); + + List chunks = chunker.split(content); + + assertThat(chunks).isNotNull().hasSize(expectedChunks.size()); + + for (int i = 0; i < chunks.size(); i++) { + assertThat(chunks.get(i).id()).isEqualTo(expectedChunks.get(i).id()); + assertThat(chunks.get(i).content()).isEqualTo(expectedChunks.get(i).content()); + } + } + + @Test + void testSplitWithBigChunkSize() { + Config config = Config.builder().chunkSize(450).chunkOverlap(0).build(); + RecursiveCharacterChunker chunker = new RecursiveCharacterChunker(config); + + List expectedChunks = List.of(new Chunk(0, + "One of the most important things I didn't understand about the world when I was a child is the degree to which the returns for performance are superlinear."), + new Chunk(1, + "Teachers and coaches implicitly told us the returns were linear. \"You get out,\" I heard a thousand times, \"what you put in.\" They meant well, but this is rarely true. If your product is only half as good as your competitor's, you don't get half as many customers. You get no customers, and you go out of business."), + new Chunk(2, + "It's obviously true that the returns for performance are superlinear in business. Some think this is a flaw of capitalism, and that if we changed the rules it would stop being true. But superlinear returns for performance are a feature of the world, not an artifact of rules we've invented. We see the same pattern in fame, power, military victories, knowledge, and even benefit to humanity. In all of these, the rich get richer. 
[1]")); + + List chunks = chunker.split(content); + + assertThat(chunks).isNotNull().hasSize(expectedChunks.size()); + + for (int i = 0; i < chunks.size(); i++) { + assertThat(chunks.get(i).id()).isEqualTo(expectedChunks.get(i).id()); + assertThat(chunks.get(i).content()).isEqualTo(expectedChunks.get(i).content()); + } + + } + +} diff --git a/jchunk-recursive-character/src/test/java/jchunk/chunker/recursive/UtilsTest.java b/jchunk-recursive-character/src/test/java/jchunk/chunker/recursive/UtilsTest.java new file mode 100644 index 0000000..a74ab7c --- /dev/null +++ b/jchunk-recursive-character/src/test/java/jchunk/chunker/recursive/UtilsTest.java @@ -0,0 +1,28 @@ +package jchunk.chunker.recursive; + +import jchunk.chunker.core.chunk.Chunk; +import org.junit.jupiter.api.Test; + +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +class UtilsTest { + + static String content = """ + This is the first sentence + + Not the first sentence + not the last as well + + finally, the last sentence, wohoo! + """; + + @Test + void splitText() { + Config config = Config.builder().chunkSize(15).build(); + + List sentences = Utils.splitContent(content, config.getChunkSize(), config.getChunkOverlap(), + config.getKeepDelimiter(), config.getDelimiters(), config.getTrimWhitespace(), new AtomicInteger(0)); + } + +} diff --git a/pom.xml b/pom.xml index 86a4ae3..f7b1aa1 100644 --- a/pom.xml +++ b/pom.xml @@ -14,9 +14,10 @@ Enable text chunkers for RAGs in Java - jchunk-semantic jchunk-core jchunk-fixed + jchunk-recursive-character + jchunk-semantic