Skip to content

Commit

Permalink
feat: add recursive chunker, needs refactoring
Browse files (browse the repository at this point in the history)
  • Loading branch information
PabloSanchi committed Aug 18, 2024
1 parent f9bb29f commit b062dad
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import jchunk.chunker.core.chunk.IChunker;

import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

/**
* {@link RecursiveCharacterChunker} is a class that implements the {@link IChunker}
Expand All @@ -26,7 +27,7 @@ public RecursiveCharacterChunker(Config config) {
@Override
public List<Chunk> split(String content) {
return Utils.splitContent(content, config.getChunkSize(), config.getChunkOverlap(), config.getKeepDelimiter(),
config.getDelimiters(), config.getTrimWhitespace());
config.getDelimiters(), config.getTrimWhitespace(), new AtomicInteger(0));
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ private Utils() {
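/**
* Recursively splits {@code content} into a list of {@link Chunk}s.
* <p>
* A working copy of {@code delimiters} is created and the best matching delimiter for
* {@code content} is chosen from it. Pieces accumulated as "good" splits are merged into
* chunks via {@code mergeSentences}. Any other piece is either added as a chunk of its own
* (when the working delimiter list is empty) or split again by a recursive call that
* receives the working delimiter list.
* <p>
* NOTE(review): the criterion that makes a piece a "good" split is not visible in this
* hunk (presumably it fits within {@code chunkSize}) - confirm against the full method.
*
* @param content the text to split
* @param chunkSize size limit forwarded to {@code mergeSentences} and to recursive calls
* @param chunkOverlap overlap setting forwarded to {@code mergeSentences} and to recursive calls
* @param keepDelimiter delimiter-keeping mode, forwarded unchanged to recursive calls
* @param delimiters candidate delimiters; copied before use, the caller's list is not the
* list handed to recursive calls
* @param trimWhitespace whitespace-trimming flag forwarded to {@code mergeSentences} and to
* recursive calls
* @param index shared counter that supplies the id of every chunk created here, in
* {@code mergeSentences}, and in recursive calls, so ids are not restarted per call
* @return the chunks produced for {@code content}
*/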
public static List<Chunk> splitContent(String content, Integer chunkSize, Integer chunkOverlap,
Config.Delimiter keepDelimiter, List<String> delimiters, Boolean trimWhitespace) {
Config.Delimiter keepDelimiter, List<String> delimiters, Boolean trimWhitespace, AtomicInteger index) {

List<String> newDelimiters = new ArrayList<>(delimiters);
String delimiter = getBestMatchingDelimiter(content, newDelimiters);
Expand All @@ -49,25 +49,25 @@ public static List<Chunk> splitContent(String content, Integer chunkSize, Intege
else {
if (!goodSplits.isEmpty()) {
List<Chunk> generatedChunks = mergeSentences(goodSplits, delimiterToUse, chunkSize, chunkOverlap,
trimWhitespace);
trimWhitespace, index);
chunks.addAll(generatedChunks);
goodSplits.clear();
}

if (newDelimiters.isEmpty()) {
chunks.add(new Chunk(0, split));
chunks.add(new Chunk(index.getAndIncrement(), split));
}
else {
List<Chunk> generatedChunks = splitContent(split, chunkSize, chunkOverlap, keepDelimiter,
newDelimiters, trimWhitespace);
newDelimiters, trimWhitespace, index);
chunks.addAll(generatedChunks);
}
}
}

if (!goodSplits.isEmpty()) {
List<Chunk> generatedChunks = mergeSentences(goodSplits, delimiterToUse, chunkSize, chunkOverlap,
trimWhitespace);
trimWhitespace, index);
chunks.addAll(generatedChunks);
}

Expand Down Expand Up @@ -162,16 +162,14 @@ private static List<String> splitWithDelimiterEnd(List<String> preSplits) {
* @return list of chunks
*/
static List<Chunk> mergeSentences(List<String> sentences, String delimiter, Integer chunkSize, Integer chunkOverlap,
Boolean trimWhitespace) {
Boolean trimWhitespace, AtomicInteger index) {

int currentLen = 0;
int delimiterLen = delimiter.length();

List<Chunk> chunks = new ArrayList<>();
List<String> currentChunk = new ArrayList<>();

AtomicInteger chunkIndex = new AtomicInteger(0);

for (String sentence : sentences) {
int sentenceLength = sentence.length();

Expand All @@ -182,7 +180,7 @@ static List<Chunk> mergeSentences(List<String> sentences, String delimiter, Inte

if (!currentChunk.isEmpty()) {
String generatedSentence = joinSentences(currentChunk, delimiter, trimWhitespace);
chunks.add(new Chunk(chunkIndex.getAndIncrement(), generatedSentence));
chunks.add(new Chunk(index.getAndIncrement(), generatedSentence));

while (currentLen > chunkOverlap
|| (currentLen + sentenceLength + (currentChunk.isEmpty() ? 0 : delimiterLen) > chunkSize
Expand All @@ -198,7 +196,7 @@ static List<Chunk> mergeSentences(List<String> sentences, String delimiter, Inte

if (!currentChunk.isEmpty()) {
String generatedSentence = joinSentences(currentChunk, delimiter, trimWhitespace);
chunks.add(new Chunk(chunkIndex.getAndIncrement(), generatedSentence));
chunks.add(new Chunk(index.getAndIncrement(), generatedSentence));
}

return chunks;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ void testSplit() {
assertThat(chunks).isNotNull().hasSize(expectedChunks.size());

for (int i = 0; i < chunks.size(); i++) {
// assertThat(chunks.get(i).id()).isEqualTo(expectedChunks.get(i).id());
assertThat(chunks.get(i).id()).isEqualTo(expectedChunks.get(i).id());
assertThat(chunks.get(i).content()).isEqualTo(expectedChunks.get(i).content());
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import org.junit.jupiter.api.Test;

import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

class UtilsTest {

Expand All @@ -23,7 +24,7 @@ void splitText() {
Config config = Config.builder().chunkSize(15).build();

List<Chunk> sentences = Utils.splitContent(content, config.getChunkSize(), config.getChunkOverlap(),
config.getKeepDelimiter(), config.getDelimiters(), config.getTrimWhitespace());
config.getKeepDelimiter(), config.getDelimiters(), config.getTrimWhitespace(), new AtomicInteger(0));
}

}

0 comments on commit b062dad

Please sign in to comment.