diff --git a/README.md b/README.md index bae1a39..5f69957 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ # JChunk +## A Spring Boot Library for Text Chunking JChunk project is simple library that enables different types of text splitting strategies. This project begun thanks to Greg Kamradt's post [text splitting ideas](https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb) @@ -10,14 +11,76 @@ For now there is only [Pablo Sanchidrian](https://github.com/PabloSanchi) develo Feel free to contribute!! ## ROAD MAP -- [ ] Character Chunker (NOT STARTED) +- [x] Fixed Character Chunker (PRE-RELEASE) - [ ] Recursive Character Text Chunker (NOT STARTED) - [ ] Document Specific Chunker (NOT STARTED) - [x] Semantic Chunker (PRE-RELEASE) - [ ] Agentic Chunker (NOT STARTED) +## Building + +To build with running unit tests + +```sh +./mvnw clean package +``` + +To reformat using the java-format plugin + +```sh +./mvnw spring-javaformat:apply +``` + +To update the year on license headers using the license-maven-plugin + +```sh +./mvnw license:update-file-header -Plicense +``` + +To check javadocs using the javadoc:javadoc + +```sh +./mvnw javadoc:javadoc -Pjavadoc +``` + +## Fixed Character Chunker +Character splitting is a basic text processing technique where text is divided into fixed-size chunks of characters. While it's not suitable for most advanced text processing tasks due to its simplicity and rigidity, it serves as an excellent starting point to understand the fundamentals of text splitting. See the following aspects of this chunker including its advantages, disadvantages, and key concepts like chunk size, chunk overlap, and separators. + +### 1. Chunk Size +The chunk size is the number of characters each chunk will contain. For example, if you set a chunk size of 50, each chunk will consist of 50 characters. 
+ +**Example:** +- Input Text: "This is an example of character splitting." +- Chunk Size: 10 +- Output Chunks: `["This is an", " example o", "f characte", "r splittin", "g."]` + +### 2. Chunk Overlap +Chunk overlap refers to the number of characters that will overlap between consecutive chunks. This helps in maintaining context across chunks by ensuring that a portion of the text at the end of one chunk is repeated at the beginning of the next chunk. + +**Example:** +- Input Text: "This is an example of character splitting." +- Chunk Size: 10 +- Chunk Overlap: 4 +- Output Chunks: `["This is an", "s an examp", "xample of ", " of charac", "aracter sp", "r splittin", "tting."]` + +### 3. Separators +Separators are specific character sequences used to split the text. For instance, you might want to split your text at every comma or period. -## Character Chunker +**Example:** +- Input Text: "This is an example. Let's split on periods. Okay?" +- Separator: ". " +- Output Chunks: `["This is an example", "Let's split on periods", "Okay?"]` + + +### Pros and Cons + +**Pros** +- Easy & Simple: Character splitting is straightforward to implement and understand. +- Basic Segmentation: It provides a basic way to segment text into smaller pieces. + +**Cons** +- Rigid: Does not consider the structure or context of the text. +- Duplicate Data: Chunk overlap creates duplicate data, which might not be efficient. ## Recursive Character Text Chunker @@ -117,32 +180,6 @@ Split the text into chunks at the identified breakpoints. 
## Agentic Chunker -## Building - -To build with running unit tests - -```sh -./mvnw clean package -``` - -To reformat using the java-format plugin - -```sh -./mvnw spring-javaformat:apply -``` - -To update the year on license headers using the license-maven-plugin - -```sh -./mvnw license:update-file-header -Plicense -``` - -To check javadocs using the javadoc:javadoc - -```sh -./mvnw javadoc:javadoc -Pjavadoc -``` - ## Contributing Please read [CONTRIBUTING.md](CONTRIBUTING.md) for details on our code of conduct, and the process for submitting pull requests to us. diff --git a/jchunk-fixed/pom.xml b/jchunk-fixed/pom.xml new file mode 100644 index 0000000..bd48d17 --- /dev/null +++ b/jchunk-fixed/pom.xml @@ -0,0 +1,42 @@ + + + 4.0.0 + + com.github.PabloSanchi + jchunk + 0.0.1-SNAPSHOT + + + jchunk-fixed + jar + JChunk - Fixed Chunker + Fixed Chunker for Java + https://github.com/PabloSanchi/jchunk + + + https://github.com/PabloSanchi/jchunk + git://github.com/PabloSanchi/jchunk.git + git@github.com:PabloSanchi/jchunk.git + + + + + com.github.PabloSanchi + jchunk-core + ${project.parent.version} + + + org.springframework.boot + spring-boot + + + + + org.springframework.boot + spring-boot-starter-test + + + + \ No newline at end of file diff --git a/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Config.java b/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Config.java new file mode 100644 index 0000000..255d1cc --- /dev/null +++ b/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Config.java @@ -0,0 +1,118 @@ +package jchunk.chunker.fixed; + +import org.springframework.util.Assert; + +/** + * Configuration for the fixed chunker + * + * @author Pablo Sanchidrian Herrera + */ +public class Config { + + private final Integer chunkSize; + + private final Integer chunkOverlap; + + private final String delimiter; + + private final Boolean trimWhitespace; + + private final Delimiter keepDelimiter; + + public Integer getChunkSize() { + return chunkSize; + } + + public 
Integer getChunkOverlap() { + return chunkOverlap; + } + + public String getDelimiter() { + return delimiter; + } + + public Boolean getTrimWhitespace() { + return trimWhitespace; + } + + public Delimiter getKeepDelimiter() { + return keepDelimiter; + } + + public Config(Integer chunkSize, Integer chunkOverlap, String delimiter, Boolean trimWhitespace, + Delimiter keepDelimiter) { + this.chunkSize = chunkSize; + this.chunkOverlap = chunkOverlap; + this.delimiter = delimiter; + this.trimWhitespace = trimWhitespace; + this.keepDelimiter = keepDelimiter; + } + + /** + * {@return the default config} + */ + public static Config defaultConfig() { + return builder().build(); + } + + public static Builder builder() { + return new Builder(); + } + + public static class Builder { + + private Integer chunkSize = 1000; + + private Integer chunkOverlap = 100; + + private String delimiter = " "; + + private Boolean trimWhitespace = true; + + private Delimiter keepDelimiter = Delimiter.NONE; + + public Builder chunkSize(Integer chunkSize) { + Assert.isTrue(chunkSize > 0, "Chunk size must be greater than 0"); + this.chunkSize = chunkSize; + return this; + } + + public Builder chunkOverlap(Integer chunkOverlap) { + Assert.isTrue(chunkOverlap >= 0, "Chunk overlap must be greater than or equal to 0"); + this.chunkOverlap = chunkOverlap; + return this; + } + + public Builder delimiter(String delimiter) { + this.delimiter = delimiter; + return this; + } + + public Builder trimWhitespace(Boolean trimWhitespace) { + this.trimWhitespace = trimWhitespace; + return this; + } + + public Builder keepDelimiter(Delimiter keepDelimiter) { + this.keepDelimiter = keepDelimiter; + return this; + } + + public Config build() { + Assert.isTrue(chunkSize > chunkOverlap, "Chunk size must be greater than chunk overlap"); + return new Config(chunkSize, chunkOverlap, delimiter, trimWhitespace, keepDelimiter); + } + + } + + /** + * Enum to represent the delimiter configuration NONE: No delimiter START: 
Delimiter + * at the start of the chunk END: Delimiter at the end of the chunk + */ + public enum Delimiter { + + NONE, START, END + + } + +} diff --git a/jchunk-fixed/src/main/java/jchunk/chunker/fixed/FixedChunker.java b/jchunk-fixed/src/main/java/jchunk/chunker/fixed/FixedChunker.java new file mode 100644 index 0000000..e4a9186 --- /dev/null +++ b/jchunk-fixed/src/main/java/jchunk/chunker/fixed/FixedChunker.java @@ -0,0 +1,31 @@ +package jchunk.chunker.fixed; + +import jchunk.chunker.core.chunk.Chunk; +import jchunk.chunker.core.chunk.IChunker; + +import java.util.List; + +/** + * {@link FixedChunker} is a chunker that splits the content into fixed size chunks. + * + * @author Pablo Sanchidrian Herrera + */ +public class FixedChunker implements IChunker { + + private final Config config; + + public FixedChunker() { + this(Config.defaultConfig()); + } + + public FixedChunker(Config config) { + this.config = config; + } + + @Override + public List split(String content) { + List sentences = Utils.splitIntoSentences(content, config); + return Utils.mergeSentences(sentences, config); + } + +} diff --git a/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Utils.java b/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Utils.java new file mode 100644 index 0000000..78c0e96 --- /dev/null +++ b/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Utils.java @@ -0,0 +1,164 @@ +package jchunk.chunker.fixed; + +import jchunk.chunker.core.chunk.Chunk; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.logging.Logger; +import java.util.stream.IntStream; + +public class Utils { + + /** + * private constructor to hide the implicit public one + */ + private Utils() { + } + + private static final Logger logger = Logger.getLogger(Utils.class.getName()); + + public static final String LONGER_THAN_THE_SPECIFIED = "Created a chunk of size %d, which is longer than the specified %d"; + + /** + * 
Splits the content into sentences using the delimiter. + * @param content the content to split + * @param config configuration for the chunker/splitter + * @return a list of split sentences + */ + public static List splitIntoSentences(String content, Config config) { + String delimiter = config.getDelimiter(); + Config.Delimiter keepDelimiter = config.getKeepDelimiter(); + + if (delimiter.isBlank()) { + return content.chars().mapToObj(c -> String.valueOf((char) c)).toList(); + } + + return splitWithDelimiter(content, delimiter, keepDelimiter); + } + + /** + * Splits the content into sentences using the delimiter. + * @param content the content to split + * @param delimiter the delimiter to split the content. + * @param keepDelimiter whether to keep the delimiter at the start or end of the + * sentence or not. {@link Config.Delimiter} + * @return a list of split sentences + */ + private static List splitWithDelimiter(String content, String delimiter, Config.Delimiter keepDelimiter) { + + if (keepDelimiter == Config.Delimiter.NONE) { + return Arrays.stream(content.split(delimiter)).filter(s -> !s.isBlank()).toList(); + } + + String withDelimiter = "((?<=%1$s)|(?=%1$s))"; + List preSplits = new ArrayList<>(List.of(content.split(String.format(withDelimiter, delimiter)))); + + return keepDelimiter == Config.Delimiter.START ? splitWithDelimiterStart(preSplits) + : splitWithDelimiterEnd(preSplits); + } + + /** + * Splits the content into sentences using the delimiter at the start of each + * sentence. 
{@link Config.Delimiter#START} + * @param preSplits pre-splits by the delimiter + * @return the list of split sentences + */ + private static List splitWithDelimiterStart(List preSplits) { + List splits = new ArrayList<>(); + + splits.add(preSplits.getFirst()); + IntStream.range(1, preSplits.size()) + .filter(i -> i % 2 == 1) + .forEach(i -> splits.add(preSplits.get(i).concat(preSplits.get(i + 1)))); + + return splits.stream().filter(s -> !s.isBlank()).toList(); + } + + /** + * Splits the content into sentences using the delimiter at the end of each sentence. + * {@link Config.Delimiter#END} + * @param preSplits the pre-splits by the delimiter + * @return the list of split sentences + */ + private static List splitWithDelimiterEnd(List preSplits) { + List splits = new ArrayList<>(); + + IntStream.range(0, preSplits.size() - 1) + .filter(i -> i % 2 == 0) + .forEach(i -> splits.add(preSplits.get(i).concat(preSplits.get(i + 1)))); + splits.add(preSplits.getLast()); + + return splits.stream().filter(s -> !s.isBlank()).toList(); + } + + /** + * Merges the sentences into chunks. + * @param sentences the sentences to merge + * @param config configuration for the chunker/splitter + * @return list of chunks + */ + static List mergeSentences(List sentences, Config config) { + String delimiter = config.getDelimiter(); + Integer chunkSize = config.getChunkSize(); + Integer chunkOverlap = config.getChunkOverlap(); + Boolean trimWhitespace = config.getTrimWhitespace(); + + int currentLen = 0; + int delimiterLen = delimiter.length(); + + List chunks = new ArrayList<>(); + List currentChunk = new ArrayList<>(); + + AtomicInteger chunkIndex = new AtomicInteger(0); + + for (String sentence : sentences) { + int sentenceLength = sentence.length(); + + if (currentLen + sentenceLength + (currentChunk.isEmpty() ? 
0 : delimiterLen) > chunkSize) { + if (currentLen > chunkSize) { + logger.warning(String.format(LONGER_THAN_THE_SPECIFIED, currentLen, config.getChunkSize())); + } + + if (!currentChunk.isEmpty()) { + String generatedSentence = joinSentences(currentChunk, delimiter, trimWhitespace); + chunks.add(new Chunk(chunkIndex.getAndIncrement(), generatedSentence)); + + while (currentLen > chunkOverlap + || (currentLen + sentenceLength + (currentChunk.isEmpty() ? 0 : delimiterLen) > chunkSize + && currentLen > 0)) { + currentLen -= currentChunk.removeFirst().length() + (currentChunk.isEmpty() ? 0 : delimiterLen); + } + } + } + + currentChunk.add(sentence); + currentLen += sentenceLength + (currentChunk.size() > 1 ? delimiterLen : 0); + } + + if (!currentChunk.isEmpty()) { + String generatedSentence = joinSentences(currentChunk, config.getDelimiter(), config.getTrimWhitespace()); + chunks.add(new Chunk(chunkIndex.getAndIncrement(), generatedSentence)); + } + + return chunks; + } + + /** + * Joins the sentences into a single sentence. 
+ * @param sentences the sentences to join + * @param delimiter the delimiter to join the sentences + * @param trimWhitespace whether to trim the whitespace + * @return the generated sentence + */ + private static String joinSentences(List sentences, String delimiter, Boolean trimWhitespace) { + String generatedSentence = String.join(delimiter, sentences); + if (trimWhitespace) { + generatedSentence = generatedSentence.trim(); + } + + return generatedSentence; + } + +} diff --git a/jchunk-fixed/src/test/java/jchunk/chunker/fixed/ConfigTest.java b/jchunk-fixed/src/test/java/jchunk/chunker/fixed/ConfigTest.java new file mode 100644 index 0000000..c2248b0 --- /dev/null +++ b/jchunk-fixed/src/test/java/jchunk/chunker/fixed/ConfigTest.java @@ -0,0 +1,57 @@ +package jchunk.chunker.fixed; + +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +class ConfigTest { + + @Test + void testDefaultConfig() { + Config config = Config.builder().build(); + + assertThat(config.getChunkSize()).isEqualTo(1000); + assertThat(config.getChunkOverlap()).isEqualTo(100); + assertThat(config.getDelimiter()).isEqualTo(" "); + assertThat(config.getTrimWhitespace()).isTrue(); + assertThat(config.getKeepDelimiter()).isEqualTo(Config.Delimiter.NONE); + } + + @Test + void testConfigBuilder() { + Config config = Config.builder() + .chunkSize(35) + .chunkOverlap(4) + .delimiter("") + .trimWhitespace(false) + .keepDelimiter(Config.Delimiter.START) + .build(); + + assertThat(config.getChunkSize()).isEqualTo(35); + assertThat(config.getChunkOverlap()).isEqualTo(4); + assertThat(config.getDelimiter()).isBlank(); + assertThat(config.getTrimWhitespace()).isFalse(); + assertThat(config.getKeepDelimiter()).isEqualTo(Config.Delimiter.START); + } + + @Test + void testConfigThrowErrorWhenChunkSizeIsNegative() { + assertThatThrownBy(() -> 
Config.builder().chunkSize(-1).build()).isInstanceOf(IllegalArgumentException.class) + .hasMessage("Chunk size must be greater than 0"); + } + + @Test + void testConfigThrowErrorWhenChunkOverlapIsNegative() { + assertThatThrownBy(() -> Config.builder().chunkOverlap(-1).build()).isInstanceOf(IllegalArgumentException.class) + .hasMessage("Chunk overlap must be greater than or equal to 0"); + } + + @Test + void testConfigThrowErrorWhenChunkOverlapIsGreaterThanChunkSize() { + assertThatThrownBy(() -> Config.builder().chunkSize(10).chunkOverlap(11).build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Chunk size must be greater than chunk overlap"); + } + +} diff --git a/jchunk-fixed/src/test/java/jchunk/chunker/fixed/FixedChunkerIT.java b/jchunk-fixed/src/test/java/jchunk/chunker/fixed/FixedChunkerIT.java new file mode 100644 index 0000000..bccc7ae --- /dev/null +++ b/jchunk-fixed/src/test/java/jchunk/chunker/fixed/FixedChunkerIT.java @@ -0,0 +1,76 @@ +package jchunk.chunker.fixed; + +import jchunk.chunker.core.chunk.Chunk; +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +class FixedChunkerIT { + + private FixedChunker chunker; + + private static final String CONTENT = "This is the text I would like to chunk up. It is the example text for this exercise"; + + @Test + void testSplitWithDefaultConfig() { + chunker = new FixedChunker(); + List expectedChunks = List + .of(new Chunk(0, "This is the text I would like to chunk up. It is the example text for this exercise")); + + List chunks = chunker.split(CONTENT); + + assertThat(chunks).isNotNull().hasSize(1); + + } + + @Test + void testSplitWithCustomConfig() { + Config config = Config.builder().chunkSize(35).chunkOverlap(4).delimiter("").build(); + + chunker = new FixedChunker(config); + + List expectedChunks = List.of(new Chunk(0, "This is the text I would like to ch"), + new Chunk(1, "o chunk up. 
It is the example text"), new Chunk(2, "ext for this exercise")); + + List chunks = chunker.split(CONTENT); + + assertThat(chunks).isNotNull().hasSize(3).containsExactlyElementsOf(expectedChunks); + } + + @Test + void testSplitWithCustomConfigNoWhiteSpace() { + Config config = Config.builder().chunkSize(35).chunkOverlap(0).delimiter("").trimWhitespace(false).build(); + + chunker = new FixedChunker(config); + + List expectedChunks = List.of(new Chunk(0, "This is the text I would like to ch"), + new Chunk(1, "unk up. It is the example text for "), new Chunk(2, "this exercise")); + + List chunks = chunker.split(CONTENT); + + assertThat(chunks).isNotNull().hasSize(3).containsExactlyElementsOf(expectedChunks); + } + + @Test + void testSplitWithCustomConfigWithKeepDelimiterSetToNone() { + Config config = Config.builder() + .chunkSize(35) + .chunkOverlap(0) + .delimiter("ch") + .trimWhitespace(true) + .keepDelimiter(Config.Delimiter.NONE) + .build(); + + chunker = new FixedChunker(config); + + List expectedChunks = List.of(new Chunk(0, "This is the text I would like to"), + new Chunk(1, "unk up. It is the example text for this exercise")); + + List chunks = chunker.split(CONTENT); + + assertThat(chunks).isNotNull().hasSize(2).containsExactlyElementsOf(expectedChunks); + } + +} diff --git a/jchunk-fixed/src/test/java/jchunk/chunker/fixed/UtilsTest.java b/jchunk-fixed/src/test/java/jchunk/chunker/fixed/UtilsTest.java new file mode 100644 index 0000000..b96d9bc --- /dev/null +++ b/jchunk-fixed/src/test/java/jchunk/chunker/fixed/UtilsTest.java @@ -0,0 +1,59 @@ +package jchunk.chunker.fixed; + +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +class UtilsTest { + + private static final String CONTENT = "This is the text I would like to chunk up. 
It is the example text for this exercise"; + + @Test + void testSplitIntoSentencesWithBlanckSeparator() { + Config config = Config.builder().delimiter("").build(); + + List sentences = Utils.splitIntoSentences(CONTENT, config); + + assertThat(sentences).isNotNull().hasSize(CONTENT.length()); + + for (int i = 0; i < CONTENT.length(); i++) { + assertThat(sentences.get(i)).isEqualTo(String.valueOf(CONTENT.charAt(i))); + } + } + + @Test + void testSplitIntoSentencesWithNoDelimiter() { + Config config = Config.builder().delimiter("ch").build(); + + List sentences = Utils.splitIntoSentences(CONTENT, config); + + assertThat(sentences).isNotNull().hasSize(2); + assertThat(sentences.getFirst()).isEqualTo("This is the text I would like to "); + assertThat(sentences.getLast()).isEqualTo("unk up. It is the example text for this exercise"); + } + + @Test + void testSplitIntoSentencesWithDelimiterStart() { + Config config = Config.builder().delimiter("ch").keepDelimiter(Config.Delimiter.START).build(); + + List sentences = Utils.splitIntoSentences(CONTENT, config); + + assertThat(sentences).isNotNull().hasSize(2); + assertThat(sentences.getFirst()).isEqualTo("This is the text I would like to "); + assertThat(sentences.getLast()).isEqualTo("chunk up. It is the example text for this exercise"); + } + + @Test + void testSplitIntoSentencesWithDelimiterEnd() { + Config config = Config.builder().delimiter("ch").keepDelimiter(Config.Delimiter.END).build(); + + List sentences = Utils.splitIntoSentences(CONTENT, config); + + assertThat(sentences).isNotNull().hasSize(2); + assertThat(sentences.getFirst()).isEqualTo("This is the text I would like to ch"); + assertThat(sentences.getLast()).isEqualTo("unk up. 
It is the example text for this exercise"); + } + +} diff --git a/jchunk-semantic/src/main/java/jchunk/chunker/semantic/Config.java b/jchunk-semantic/src/main/java/jchunk/chunker/semantic/Config.java index 4f1882b..6238d2e 100644 --- a/jchunk-semantic/src/main/java/jchunk/chunker/semantic/Config.java +++ b/jchunk-semantic/src/main/java/jchunk/chunker/semantic/Config.java @@ -21,6 +21,10 @@ public Integer getPercentile() { return percentile; } + public Integer getBufferSize() { + return bufferSize; + } + public Config(SentenceSplitingStrategy sentenceSplitingStrategy, Integer percentile, Integer bufferSize) { this.sentenceSplitingStrategy = sentenceSplitingStrategy; this.percentile = percentile; diff --git a/jchunk-semantic/src/main/java/jchunk/chunker/semantic/SemanticChunker.java b/jchunk-semantic/src/main/java/jchunk/chunker/semantic/SemanticChunker.java index 4e18bc6..0182659 100644 --- a/jchunk-semantic/src/main/java/jchunk/chunker/semantic/SemanticChunker.java +++ b/jchunk-semantic/src/main/java/jchunk/chunker/semantic/SemanticChunker.java @@ -33,7 +33,7 @@ public SemanticChunker(EmbeddingModel embeddingModel, Config config) { @Override public List split(String content) { List sentences = Utils.splitSentences(content, config.getSentenceSplitingStrategy()); - sentences = Utils.combineSentences(sentences, 1); + sentences = Utils.combineSentences(sentences, config.getBufferSize()); sentences = Utils.embedSentences(embeddingModel, sentences); List similarities = Utils.calculateSimilarities(sentences); List breakPoints = Utils.calculateBreakPoints(similarities, config.getPercentile()); diff --git a/jchunk-semantic/src/main/java/jchunk/chunker/semantic/Utils.java b/jchunk-semantic/src/main/java/jchunk/chunker/semantic/Utils.java index 4d2c6a9..ca7ac4a 100644 --- a/jchunk-semantic/src/main/java/jchunk/chunker/semantic/Utils.java +++ b/jchunk-semantic/src/main/java/jchunk/chunker/semantic/Utils.java @@ -1,6 +1,7 @@ package jchunk.chunker.semantic; import 
jchunk.chunker.core.chunk.Chunk; +import org.nd4j.common.io.Assert; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.factory.Nd4j; import org.springframework.ai.embedding.EmbeddingModel; @@ -19,6 +20,12 @@ */ public class Utils { + /** + * Private constructor to hide the implicit public one + */ + private Utils() { + } + /** * Split the content into sentences * @param content the content to split @@ -28,7 +35,7 @@ public static List splitSentences(String content, SentenceSplitingStra AtomicInteger index = new AtomicInteger(0); return Arrays.stream(content.split(splitingStrategy.getStrategy())) .map(sentence -> Sentence.builder().content(sentence).index(index.getAndIncrement()).build()) - .collect(Collectors.toList()); + .toList(); } /** @@ -94,7 +101,7 @@ public static List embedSentences(EmbeddingModel embeddingModel, List< Sentence sentence = sentences.get(i); sentence.setEmbedding(embeddings.get(i)); return sentence; - }).collect(Collectors.toList()); + }).toList(); } /** @@ -127,7 +134,7 @@ public static List calculateSimilarities(List sentences) { Sentence sentence1 = sentences.get(i); Sentence sentence2 = sentences.get(i + 1); return cosineSimilarity(sentence1.getEmbedding(), sentence2.getEmbedding()); - }).collect(Collectors.toList()); + }).toList(); } /** @@ -136,7 +143,7 @@ public static List calculateSimilarities(List sentences) { * @return the list of break points indices */ public static List calculateBreakPoints(List distances, Integer percentile) { - assert distances != null : "The list of distances cannot be null"; + Assert.isTrue(distances != null, "The list of distances cannot be null"); double breakpointDistanceThreshold = calculatePercentile(distances, percentile); @@ -147,8 +154,8 @@ public static List calculateBreakPoints(List distances, Integer } private static Double calculatePercentile(List distances, int percentile) { - assert distances != null : "The list of distances cannot be null"; - assert percentile > 0 && 
percentile < 100 : "The percentile must be between 0 and 100"; + Assert.isTrue(distances != null, "The list of distances cannot be null"); + Assert.isTrue(percentile > 0 && percentile < 100, "The percentile must be between 0 and 100"); distances = distances.stream().sorted().toList(); @@ -163,9 +170,9 @@ private static Double calculatePercentile(List distances, int percentile * @return the list of chunks */ public static List generateChunks(List sentences, List breakPoints) { - assert sentences != null : "The list of sentences cannot be null"; - assert !sentences.isEmpty() : "The list of sentences cannot be empty"; - assert breakPoints != null : "The list of break points cannot be null"; + Assert.isTrue(sentences != null, "The list of sentences cannot be null"); + Assert.isTrue(!sentences.isEmpty(), "The list of sentences cannot be empty"); + Assert.isTrue(breakPoints != null, "The list of break points cannot be null"); AtomicInteger index = new AtomicInteger(0); @@ -177,7 +184,7 @@ public static List generateChunks(List sentences, List .map(Sentence::getContent) .collect(Collectors.joining(" ")); return new Chunk(index.getAndIncrement(), content); - }).collect(Collectors.toList()); + }).toList(); } } diff --git a/jchunk-semantic/src/test/java/jchunk/chunker/semantic/SemanticChunkerIT.java b/jchunk-semantic/src/test/java/jchunk/chunker/semantic/SemanticChunkerIT.java index 8a771a7..b4e7a61 100644 --- a/jchunk-semantic/src/test/java/jchunk/chunker/semantic/SemanticChunkerIT.java +++ b/jchunk-semantic/src/test/java/jchunk/chunker/semantic/SemanticChunkerIT.java @@ -21,7 +21,7 @@ @SpringBootTest @Disabled("Only for manual testing purposes.") -public class SemanticChunkerIT { +class SemanticChunkerIT { @Autowired private SemanticChunker semanticChunker; @@ -29,11 +29,9 @@ public class SemanticChunkerIT { @Autowired private EmbeddingModel embeddingModel; - private final Integer EMBEDDING_MODEL_DIMENSION = 384; + private final String mitContent = 
getText("classpath:/data/mit.txt"); - private String mitContent = getText("classpath:/data/mit.txt"); - - public static String getText(String uri) { + static String getText(String uri) { var resource = new DefaultResourceLoader().getResource(uri); try { return resource.getContentAsString(StandardCharsets.UTF_8); @@ -44,19 +42,18 @@ public static String getText(String uri) { } @Test - public void documentContentLoaded() { + void documentContentLoaded() { assertThat(mitContent).isNotBlank(); } @Test - public void getSentences() { + void getSentences() { List sentences = Utils.splitSentences(mitContent, SentenceSplitingStrategy.DEFAULT); - assertThat(sentences).isNotEmpty(); - assertThat(sentences).hasSize(317); + assertThat(sentences).isNotEmpty().hasSize(317); } @Test - public void combineSentences() { + void combineSentences() { List sentences = Utils.splitSentences(mitContent, SentenceSplitingStrategy.DEFAULT); List combined = Utils.combineSentences(sentences, 1); @@ -70,24 +67,24 @@ public void combineSentences() { } @Test - public void embedChunks() { + void embedChunks() { + int EMBEDDING_MODEL_DIMENSION = 384; + List sentences = Utils.splitSentences(mitContent, SentenceSplitingStrategy.DEFAULT); List combined = Utils.combineSentences(sentences, 1); List embedded = Utils.embedSentences(embeddingModel, combined); - assertThat(embedded).isNotEmpty(); - assertThat(embedded).hasSize(317); + assertThat(embedded).isNotEmpty().hasSize(317); assertThat(embedded.getFirst().getIndex()).isEqualTo(0); assertThat(embedded.getFirst().getContent()).isEqualTo("\n\nWant to start a startup?"); assertThat(embedded.getFirst().getCombined()) .isEqualTo("\n\nWant to start a startup? 
Get funded by\nY Combinator."); - assertThat(embedded.getFirst().getEmbedding()).isNotNull(); - assertThat(embedded.getFirst().getEmbedding()).hasSize(EMBEDDING_MODEL_DIMENSION); + assertThat(embedded.getFirst().getEmbedding()).isNotNull().hasSize(EMBEDDING_MODEL_DIMENSION); } @Test - public void getCosineDistancesArray() { + void getCosineDistancesArray() { List sentences = Utils.splitSentences(mitContent, SentenceSplitingStrategy.DEFAULT); List combined = Utils.combineSentences(sentences, 1); List embedded = Utils.embedSentences(embeddingModel, combined); @@ -97,7 +94,7 @@ public void getCosineDistancesArray() { } @Test - public void getChunks() { + void getChunks() { List chunks = this.semanticChunker.split(mitContent); assertThat(chunks).isNotEmpty(); } diff --git a/jchunk-semantic/src/test/java/jchunk/chunker/semantic/SemanticChunkerUtilsTest.java b/jchunk-semantic/src/test/java/jchunk/chunker/semantic/SemanticChunkerUtilsTest.java index 91f3138..fda0e93 100644 --- a/jchunk-semantic/src/test/java/jchunk/chunker/semantic/SemanticChunkerUtilsTest.java +++ b/jchunk-semantic/src/test/java/jchunk/chunker/semantic/SemanticChunkerUtilsTest.java @@ -10,16 +10,16 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.AssertionsForClassTypes.assertThatThrownBy; -public class SemanticChunkerUtilsTest { +class SemanticChunkerUtilsTest { - private final EmbeddingModel embeddingModel; + final EmbeddingModel embeddingModel; - public SemanticChunkerUtilsTest() { + SemanticChunkerUtilsTest() { this.embeddingModel = Mockito.mock(EmbeddingModel.class); } @Test - public void splitSentenceDefaultStrategyTest() { + void splitSentenceDefaultStrategyTest() { List expectedResult = List.of(Sentence.builder().content("This is a test sentence.").build(), Sentence.builder().content("How are u?").build(), Sentence.builder().content("I am fine thanks\nI am a test sentence!").build(), @@ -28,8 +28,7 @@ public void splitSentenceDefaultStrategyTest() { 
String content = "This is a test sentence. How are u? I am fine thanks\nI am a test sentence! sure"; List result = Utils.splitSentences(content, SentenceSplitingStrategy.DEFAULT); - assertThat(result).isNotNull(); - assertThat(result.size()).isEqualTo(expectedResult.size()); + assertThat(result).isNotNull().hasSize(expectedResult.size()); for (int i = 0; i < result.size(); i++) { assertThat(result.get(i).getContent()).isEqualTo(expectedResult.get(i).getContent()); @@ -37,7 +36,7 @@ public void splitSentenceDefaultStrategyTest() { } @Test - public void splitSentenceStrategyTest() { + void splitSentenceStrategyTest() { List expectedResult = List.of( Sentence.builder().content("This is a test sentence. How are u? I am fine thanks").build(), Sentence.builder().content("I am a test sentence! sure").build()); @@ -45,15 +44,14 @@ public void splitSentenceStrategyTest() { String content = "This is a test sentence. How are u? I am fine thanks\nI am a test sentence! sure"; List result = Utils.splitSentences(content, SentenceSplitingStrategy.LINE_BREAK); - assertThat(result).isNotNull(); - assertThat(result.size()).isEqualTo(expectedResult.size()); + assertThat(result).isNotNull().hasSize(expectedResult.size()); assertThat(result.get(0).getContent()).isEqualTo(expectedResult.get(0).getContent()); assertThat(result.get(1).getContent()).isEqualTo(expectedResult.get(1).getContent()); } @Test - public void splitSentenceParagraphStrategyTest() { + void splitSentenceParagraphStrategyTest() { List expectedResult = List.of(Sentence.builder().index(0).content("This is a test sentence.").build(), Sentence.builder().index(1).content("How are u? I am fine thanks").build(), Sentence.builder().index(2).content("I am a test sentence!\nsure").build()); @@ -61,8 +59,7 @@ public void splitSentenceParagraphStrategyTest() { String content = "This is a test sentence.\n\nHow are u? 
I am fine thanks\n\nI am a test sentence!\nsure"; List result = Utils.splitSentences(content, SentenceSplitingStrategy.PARAGRAPH); - assertThat(result).isNotNull(); - assertThat(result.size()).isEqualTo(expectedResult.size()); + assertThat(result).isNotNull().hasSize(expectedResult.size()); assertThat(result.get(0).getContent()).isEqualTo(expectedResult.get(0).getContent()); assertThat(result.get(1).getContent()).isEqualTo(expectedResult.get(1).getContent()); @@ -70,7 +67,7 @@ public void splitSentenceParagraphStrategyTest() { } @Test - public void combineSentencesSuccessTest() { + void combineSentencesSuccessTest() { Integer bufferSize = 2; List input = List.of(Sentence.builder().index(0).content("This").build(), Sentence.builder().index(1).content("is").build(), Sentence.builder().index(2).content("a").build(), @@ -100,7 +97,7 @@ public void combineSentencesSuccessTest() { } @Test - public void combineSentencesWithBufferSizeEqualZeroTest() { + void combineSentencesWithBufferSizeEqualZeroTest() { Integer bufferSize = 0; List input = List.of(Sentence.builder().content("This").build()); @@ -109,7 +106,7 @@ public void combineSentencesWithBufferSizeEqualZeroTest() { } @Test - public void combineSentencesWithBufferSizeIsNullTest() { + void combineSentencesWithBufferSizeIsNullTest() { Integer bufferSize = null; List input = List.of(Sentence.builder().content("This").build()); @@ -118,7 +115,7 @@ public void combineSentencesWithBufferSizeIsNullTest() { } @Test - public void combineSentencesWithBufferSizeGreaterThanInputLengthTest() { + void combineSentencesWithBufferSizeGreaterThanInputLengthTest() { Integer bufferSize = 1; List input = List.of(Sentence.builder().content("This").build()); @@ -127,7 +124,7 @@ public void combineSentencesWithBufferSizeGreaterThanInputLengthTest() { } @Test - public void combineSentencesWithInputIsNullTest() { + void combineSentencesWithInputIsNullTest() { Integer bufferSize = 2; List input = null; @@ -136,7 +133,7 @@ public void 
combineSentencesWithInputIsNullTest() { } @Test - public void combineSentencesWithInputIsEmptyTest() { + void combineSentencesWithInputIsEmptyTest() { Integer bufferSize = 2; List input = List.of(); @@ -145,7 +142,7 @@ public void combineSentencesWithInputIsEmptyTest() { } @Test - public void embedSentencesTest() { + void embedSentencesTest() { Mockito.when(embeddingModel.embed(Mockito.anyList())) .thenReturn(List.of(List.of(1.0, 2.0, 3.0), List.of(4.0, 5.0, 6.0))); @@ -168,7 +165,7 @@ public void embedSentencesTest() { } @Test - public void testIdenticalVectors() { + void testIdenticalVectors() { List embedding1 = List.of(1.0, 2.0, 3.0); List embedding2 = List.of(1.0, 2.0, 3.0); @@ -178,7 +175,7 @@ public void testIdenticalVectors() { } @Test - public void testOrthogonalVectors() { + void testOrthogonalVectors() { List embedding1 = List.of(1.0, 0.0, 0.0); List embedding2 = List.of(0.0, 1.0, 0.0); @@ -188,7 +185,7 @@ public void testOrthogonalVectors() { } @Test - public void testOppositeVectors() { + void testOppositeVectors() { List embedding1 = List.of(1.0, 2.0, 3.0); List embedding2 = List.of(-1.0, -2.0, -3.0); @@ -198,7 +195,7 @@ public void testOppositeVectors() { } @Test - public void testDifferentMagnitudeVectors() { + void testDifferentMagnitudeVectors() { List embedding1 = List.of(1.0, 2.0, 3.0); List embedding2 = List.of(2.0, 4.0, 6.0); @@ -208,7 +205,7 @@ public void testDifferentMagnitudeVectors() { } @Test - public void testZeroVectors() { + void testZeroVectors() { List embedding1 = List.of(0.0, 0.0, 0.0); List embedding2 = List.of(0.0, 0.0, 0.0); @@ -218,7 +215,7 @@ public void testZeroVectors() { } @Test - public void testGetIndicesAboveThreshold() { + void testGetIndicesAboveThreshold() { Integer percentile = 95; List distances = List.of(10.0, 15.0, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0, 55.0, 60.0, 65.0, 70.0, 75.0); @@ -231,7 +228,7 @@ public void testGetIndicesAboveThreshold() { } @Test - public void testGenerateChunks() { + void 
testGenerateChunks() { List sentences = List.of(Sentence.builder().index(0).content("This").build(), Sentence.builder().index(1).content("is").build(), Sentence.builder().index(2).content("a").build(), Sentence.builder().index(3).content("test.").build(), Sentence.builder().index(4).content("We").build(), @@ -247,8 +244,7 @@ public void testGenerateChunks() { List actualChunks = Utils.generateChunks(sentences, breakPoints); - assertThat(actualChunks).isNotNull(); - assertThat(actualChunks.size()).isEqualTo(expectedChunks.size()); + assertThat(actualChunks).isNotNull().hasSize(expectedChunks.size()); for (int i = 0; i < actualChunks.size(); i++) { assertThat(actualChunks.get(i).id()).isEqualTo(expectedChunks.get(i).id()); diff --git a/pom.xml b/pom.xml index 4f2f387..f7b1aa1 100644 --- a/pom.xml +++ b/pom.xml @@ -15,8 +15,9 @@ jchunk-core - jchunk-semantic + jchunk-fixed jchunk-recursive-character + jchunk-semantic