From b062dad87fc331816cae09ad65418b44c4895503 Mon Sep 17 00:00:00 2001 From: Pablo Sanchidrian Date: Sun, 18 Aug 2024 18:07:49 +0200 Subject: [PATCH] feat: add recursive chunker, needs refactoring --- .../recursive/RecursiveCharacterChunker.java | 3 ++- .../java/jchunk/chunker/recursive/Utils.java | 18 ++++++++---------- .../RecursiveCharacterChunkerTest.java | 2 +- .../jchunk/chunker/recursive/UtilsTest.java | 3 ++- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/jchunk-recursive-character/src/main/java/jchunk/chunker/recursive/RecursiveCharacterChunker.java b/jchunk-recursive-character/src/main/java/jchunk/chunker/recursive/RecursiveCharacterChunker.java index 7c2511a..4acdd8d 100644 --- a/jchunk-recursive-character/src/main/java/jchunk/chunker/recursive/RecursiveCharacterChunker.java +++ b/jchunk-recursive-character/src/main/java/jchunk/chunker/recursive/RecursiveCharacterChunker.java @@ -4,6 +4,7 @@ import jchunk.chunker.core.chunk.IChunker; import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; /** * {@link RecursiveCharacterChunker} is a class that implements the {@link IChunker} @@ -26,7 +27,7 @@ public RecursiveCharacterChunker(Config config) { @Override public List split(String content) { return Utils.splitContent(content, config.getChunkSize(), config.getChunkOverlap(), config.getKeepDelimiter(), - config.getDelimiters(), config.getTrimWhitespace()); + config.getDelimiters(), config.getTrimWhitespace(), new AtomicInteger(0)); } } diff --git a/jchunk-recursive-character/src/main/java/jchunk/chunker/recursive/Utils.java b/jchunk-recursive-character/src/main/java/jchunk/chunker/recursive/Utils.java index 999bad2..627f1c2 100644 --- a/jchunk-recursive-character/src/main/java/jchunk/chunker/recursive/Utils.java +++ b/jchunk-recursive-character/src/main/java/jchunk/chunker/recursive/Utils.java @@ -30,7 +30,7 @@ private Utils() { /** */ public static List splitContent(String content, Integer chunkSize, Integer chunkOverlap, - Config.Delimiter keepDelimiter, List delimiters, Boolean trimWhitespace) { + Config.Delimiter keepDelimiter, List delimiters, Boolean trimWhitespace, AtomicInteger index) { List newDelimiters = new ArrayList<>(delimiters); String delimiter = getBestMatchingDelimiter(content, newDelimiters); @@ -49,17 +49,17 @@ public static List splitContent(String content, Integer chunkSize, Intege else { if (!goodSplits.isEmpty()) { List generatedChunks = mergeSentences(goodSplits, delimiterToUse, chunkSize, chunkOverlap, - trimWhitespace); + trimWhitespace, index); chunks.addAll(generatedChunks); goodSplits.clear(); } if (newDelimiters.isEmpty()) { - chunks.add(new Chunk(0, split)); + chunks.add(new Chunk(index.getAndIncrement(), split)); } else { List generatedChunks = splitContent(split, chunkSize, chunkOverlap, keepDelimiter, - newDelimiters, trimWhitespace); + newDelimiters, trimWhitespace, index); chunks.addAll(generatedChunks); } } @@ -67,7 +67,7 @@ public static List splitContent(String content, Integer chunkSize, Intege if (!goodSplits.isEmpty()) { List generatedChunks = mergeSentences(goodSplits, delimiterToUse, chunkSize, chunkOverlap, - trimWhitespace); + trimWhitespace, index); chunks.addAll(generatedChunks); } @@ -162,7 +162,7 @@ private static List splitWithDelimiterEnd(List preSplits) { * @return list of chunks */ static List mergeSentences(List sentences, String delimiter, Integer chunkSize, Integer chunkOverlap, - Boolean trimWhitespace) { + Boolean trimWhitespace, AtomicInteger index) { int currentLen = 0; int delimiterLen = delimiter.length(); @@ -170,8 +170,6 @@ static List mergeSentences(List sentences, String delimiter, Inte List chunks = new ArrayList<>(); List currentChunk = new ArrayList<>(); - AtomicInteger chunkIndex = new AtomicInteger(0); - for (String sentence : sentences) { int sentenceLength = sentence.length(); @@ -182,7 +180,7 @@ static List mergeSentences(List sentences, String delimiter, Inte if (!currentChunk.isEmpty()) { String generatedSentence = joinSentences(currentChunk, delimiter, trimWhitespace); - chunks.add(new Chunk(chunkIndex.getAndIncrement(), generatedSentence)); + chunks.add(new Chunk(index.getAndIncrement(), generatedSentence)); while (currentLen > chunkOverlap || (currentLen + sentenceLength + (currentChunk.isEmpty() ? 0 : delimiterLen) > chunkSize @@ -198,7 +196,7 @@ static List mergeSentences(List sentences, String delimiter, Inte if (!currentChunk.isEmpty()) { String generatedSentence = joinSentences(currentChunk, delimiter, trimWhitespace); - chunks.add(new Chunk(chunkIndex.getAndIncrement(), generatedSentence)); + chunks.add(new Chunk(index.getAndIncrement(), generatedSentence)); } return chunks; diff --git a/jchunk-recursive-character/src/test/java/jchunk/chunker/recursive/RecursiveCharacterChunkerTest.java b/jchunk-recursive-character/src/test/java/jchunk/chunker/recursive/RecursiveCharacterChunkerTest.java index af07603..7ce674e 100644 --- a/jchunk-recursive-character/src/test/java/jchunk/chunker/recursive/RecursiveCharacterChunkerTest.java +++ b/jchunk-recursive-character/src/test/java/jchunk/chunker/recursive/RecursiveCharacterChunkerTest.java @@ -44,7 +44,7 @@ void testSplit() { assertThat(chunks).isNotNull().hasSize(expectedChunks.size()); for (int i = 0; i < chunks.size(); i++) { - // assertThat(chunks.get(i).id()).isEqualTo(expectedChunks.get(i).id()); + assertThat(chunks.get(i).id()).isEqualTo(expectedChunks.get(i).id()); assertThat(chunks.get(i).content()).isEqualTo(expectedChunks.get(i).content()); } } diff --git a/jchunk-recursive-character/src/test/java/jchunk/chunker/recursive/UtilsTest.java b/jchunk-recursive-character/src/test/java/jchunk/chunker/recursive/UtilsTest.java index f811a28..a592c01 100644 --- a/jchunk-recursive-character/src/test/java/jchunk/chunker/recursive/UtilsTest.java +++ b/jchunk-recursive-character/src/test/java/jchunk/chunker/recursive/UtilsTest.java @@ -6,6 +6,7 @@ import org.junit.jupiter.api.Test; import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; class UtilsTest { @@ -23,7 +24,7 @@ void splitText() { Config config = Config.builder().chunkSize(15).build(); List sentences = Utils.splitContent(content, config.getChunkSize(), config.getChunkOverlap(), - config.getKeepDelimiter(), config.getDelimiters(), config.getTrimWhitespace()); + config.getKeepDelimiter(), config.getDelimiters(), config.getTrimWhitespace(), new AtomicInteger(0)); } }