Skip to content

Commit

Permalink
Merge pull request #18 from PabloSanchi/develop
Browse files Browse the repository at this point in the history
Feat: add recursive chunker
  • Loading branch information
PabloSanchi authored Aug 18, 2024
2 parents e845835 + 68fbce0 commit b38b056
Show file tree
Hide file tree
Showing 9 changed files with 655 additions and 1 deletion.
26 changes: 26 additions & 0 deletions jchunk-core/src/main/java/jchunk/chunker/core/chunk/Chunk.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,30 @@
* @author Pablo Sanchidrian Herrera
*/
public record Chunk(Integer id, String content) {

    /**
     * Entry point of the fluent API for assembling a {@link Chunk}.
     * @return a new {@link Builder} with neither id nor content set
     */
    public static Builder builder() {
        Builder fresh = new Builder();
        return fresh;
    }

    /**
     * Mutable, fluent builder for {@link Chunk}. Values that are never set stay
     * {@code null} in the built record.
     */
    public static class Builder {

        private String text;

        private Integer identifier;

        /**
         * Sets the text carried by the chunk.
         * @param content the chunk text
         * @return this builder, for chaining
         */
        public Builder content(String content) {
            text = content;
            return this;
        }

        /**
         * Sets the identifier of the chunk.
         * @param id the chunk identifier
         * @return this builder, for chaining
         */
        public Builder id(Integer id) {
            identifier = id;
            return this;
        }

        /**
         * Creates the record from the values collected so far.
         * @return a new {@link Chunk}
         */
        public Chunk build() {
            return new Chunk(identifier, text);
        }

    }
}
43 changes: 43 additions & 0 deletions jchunk-recursive-character/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.github.PabloSanchi</groupId>
<artifactId>jchunk</artifactId>
<version>0.0.1-SNAPSHOT</version>
</parent>

<artifactId>jchunk-recursive-character</artifactId>
<packaging>jar</packaging>
<name>JChunk - Recursive Character Chunker</name>
<description>Recursive Character Chunker for Java</description>
<url>https://github.com/PabloSanchi/jchunk</url>

<scm>
<url>https://github.com/PabloSanchi/jchunk</url>
<connection>git://github.com/PabloSanchi/jchunk.git</connection>
<developerConnection>git@github.com:PabloSanchi/jchunk.git</developerConnection>
</scm>

<dependencies>
<dependency>
<groupId>com.github.PabloSanchi</groupId>
<artifactId>jchunk-core</artifactId>
<version>${project.parent.version}</version>
</dependency>

<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot</artifactId>
</dependency>

<!-- test -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
</dependency>
</dependencies>

</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
package jchunk.chunker.recursive;

import org.springframework.util.Assert;

import java.util.ArrayList;
import java.util.List;

/**
 * {@link Config} holds the configuration for the {@link RecursiveCharacterChunker}.
 * <p>
 * Instances are created through {@link #builder()} or {@link #defaultConfig()}. The
 * delimiter list is copied both when the configuration is built and when it is read, so
 * changes made to a list passed to the builder, or to a list returned by
 * {@link #getDelimiters()}, do not affect an existing {@link Config}.
 *
 * @author Pablo Sanchidrian Herrera
 */
public class Config {

    private final Integer chunkSize;

    private final Integer chunkOverlap;

    private final List<String> delimiters;

    private final Delimiter keepDelimiter;

    private final Boolean trimWhitespace;

    /**
     * {@return the configured chunk size}
     */
    public Integer getChunkSize() {
        return chunkSize;
    }

    /**
     * {@return the configured chunk overlap}
     */
    public Integer getChunkOverlap() {
        return chunkOverlap;
    }

    /**
     * {@return a fresh, mutable copy of the configured delimiters, in their configured
     * order}
     */
    public List<String> getDelimiters() {
        // Hand out a copy so callers cannot alter this configuration through the getter.
        return new ArrayList<>(delimiters);
    }

    /**
     * {@return the configured delimiter placement}
     */
    public Delimiter getKeepDelimiter() {
        return keepDelimiter;
    }

    /**
     * {@return the configured trim-whitespace flag}
     */
    public Boolean getTrimWhitespace() {
        return trimWhitespace;
    }

    private Config(Integer chunkSize, Integer chunkOverlap, List<String> delimiters, Delimiter keepDelimiter,
            Boolean trimWhitespace) {
        this.chunkSize = chunkSize;
        this.chunkOverlap = chunkOverlap;
        // Own copy: configs built from the same builder must not share one list.
        this.delimiters = new ArrayList<>(delimiters);
        this.keepDelimiter = keepDelimiter;
        this.trimWhitespace = trimWhitespace;
    }

    /**
     * {@return the default config}
     */
    public static Config defaultConfig() {
        return builder().build();
    }

    /**
     * {@return a new builder pre-populated with the default values}
     */
    public static Builder builder() {
        return new Builder();
    }

    /**
     * Fluent builder for {@link Config}.
     * <p>
     * Defaults: chunk size {@code 100}, chunk overlap {@code 20}, delimiters
     * {@code "\n\n"}, {@code "\n"}, {@code " "}, {@code ""}, delimiter placement
     * {@link Delimiter#START} and whitespace trimming enabled.
     */
    public static class Builder {

        private Integer chunkSize = 100;

        private Integer chunkOverlap = 20;

        private List<String> delimiters = new ArrayList<>(List.of("\n\n", "\n", " ", ""));

        private Delimiter keepDelimiter = Delimiter.START;

        private Boolean trimWhitespace = true;

        /**
         * Sets the chunk size.
         * @param chunkSize the chunk size, must be greater than 0
         * @return this builder
         * @throws IllegalArgumentException if {@code chunkSize} is {@code null} or not
         * greater than 0
         */
        public Builder chunkSize(Integer chunkSize) {
            // Explicit null check: unboxing null below would otherwise raise a bare NPE.
            Assert.notNull(chunkSize, "Chunk size must not be null");
            Assert.isTrue(chunkSize > 0, "Chunk size must be greater than 0");
            this.chunkSize = chunkSize;
            return this;
        }

        /**
         * Sets the chunk overlap.
         * @param chunkOverlap the chunk overlap, must be greater than or equal to 0
         * @return this builder
         * @throws IllegalArgumentException if {@code chunkOverlap} is {@code null} or
         * negative
         */
        public Builder chunkOverlap(Integer chunkOverlap) {
            Assert.notNull(chunkOverlap, "Chunk overlap must not be null");
            Assert.isTrue(chunkOverlap >= 0, "Chunk overlap must be greater than or equal to 0");
            this.chunkOverlap = chunkOverlap;
            return this;
        }

        /**
         * Sets the delimiters. The list is copied, so later changes to the argument are
         * not reflected in this builder.
         * @param delimiters the delimiters, must not be {@code null}
         * @return this builder
         * @throws IllegalArgumentException if {@code delimiters} is {@code null}
         */
        public Builder separators(List<String> delimiters) {
            Assert.notNull(delimiters, "Delimiters must not be null");
            this.delimiters = new ArrayList<>(delimiters);
            return this;
        }

        /**
         * Sets the delimiter placement.
         * @param keepDelimiter the delimiter placement
         * @return this builder
         */
        public Builder keepDelimiter(Delimiter keepDelimiter) {
            this.keepDelimiter = keepDelimiter;
            return this;
        }

        /**
         * Sets the trim-whitespace flag.
         * @param trimWhitespace the trim-whitespace flag
         * @return this builder
         */
        public Builder trimWhitespace(Boolean trimWhitespace) {
            this.trimWhitespace = trimWhitespace;
            return this;
        }

        /**
         * Builds the {@link Config} from the values collected so far.
         * @return a new {@link Config}
         * @throws IllegalArgumentException if the chunk size is not greater than the
         * chunk overlap
         */
        public Config build() {
            Assert.isTrue(chunkSize > chunkOverlap, "Chunk size must be greater than chunk overlap");
            return new Config(chunkSize, chunkOverlap, delimiters, keepDelimiter, trimWhitespace);
        }

    }

    /**
     * Enum to represent the delimiter configuration
     * <p>
     * <ul>
     * <li>NONE: No delimiter</li>
     * <li>START: Delimiter at the start of the chunk</li>
     * <li>END: Delimiter at the end of the chunk</li>
     * </ul>
     */
    public enum Delimiter {

        NONE, START, END

    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package jchunk.chunker.recursive;

import jchunk.chunker.core.chunk.Chunk;
import jchunk.chunker.core.chunk.IChunker;

import java.util.List;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * {@link RecursiveCharacterChunker} is a class that implements the {@link IChunker}
 * interface and splits a text into chunks recursively with the given separators.
 * <p>
 * The splitting itself is delegated to {@code Utils.splitContent}; this class only
 * supplies the values held by its {@link Config}.
 *
 * @author Pablo Sanchidrian Herrera
 */
public class RecursiveCharacterChunker implements IChunker {

    private final Config config;

    /**
     * Creates a chunker that uses {@link Config#defaultConfig()}.
     */
    public RecursiveCharacterChunker() {
        this(Config.defaultConfig());
    }

    /**
     * Creates a chunker that uses the given configuration.
     * @param config the configuration to use, must not be {@code null}
     * @throws NullPointerException if {@code config} is {@code null}
     */
    public RecursiveCharacterChunker(Config config) {
        // Fail at construction time instead of on the first call to split.
        this.config = Objects.requireNonNull(config, "config must not be null");
    }

    /**
     * Splits the given content using the settings of this chunker's {@link Config}.
     * @param content the text to split
     * @return the chunks produced by {@code Utils.splitContent}
     */
    @Override
    public List<Chunk> split(String content) {
        // A fresh counter starting at 0 is handed over on every call
        // (presumably the source of the chunk ids; confirm in Utils).
        return Utils.splitContent(content, config.getChunkSize(), config.getChunkOverlap(), config.getKeepDelimiter(),
                config.getDelimiters(), config.getTrimWhitespace(), new AtomicInteger(0));
    }

}
Loading

0 comments on commit b38b056

Please sign in to comment.