-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #18 from PabloSanchi/develop
Feat: add recursive chunker
- Loading branch information
Showing
9 changed files
with
655 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<project xmlns="http://maven.apache.org/POM/4.0.0" | ||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
<modelVersion>4.0.0</modelVersion> | ||
<parent> | ||
<groupId>com.github.PabloSanchi</groupId> | ||
<artifactId>jchunk</artifactId> | ||
<version>0.0.1-SNAPSHOT</version> | ||
</parent> | ||
|
||
<artifactId>jchunk-recursive-character</artifactId> | ||
<packaging>jar</packaging> | ||
<name>JChunk - Recursive Character Chunker</name> | ||
<description>Recursive Character Chunker for Java</description> | ||
<url>https://github.com/PabloSanchi/jchunk</url> | ||
|
||
<scm> | ||
<url>https://github.com/PabloSanchi/jchunk</url> | ||
<connection>git://github.com/PabloSanchi/jchunk.git</connection> | ||
<developerConnection>git@github.com:PabloSanchi/jchunk.git</developerConnection> | ||
</scm> | ||
|
||
<dependencies> | ||
<dependency> | ||
<groupId>com.github.PabloSanchi</groupId> | ||
<artifactId>jchunk-core</artifactId> | ||
<version>${project.parent.version}</version> | ||
</dependency> | ||
|
||
<dependency> | ||
<groupId>org.springframework.boot</groupId> | ||
<artifactId>spring-boot</artifactId> | ||
</dependency> | ||
|
||
<!-- test --> | ||
<dependency> | ||
<groupId>org.springframework.boot</groupId> | ||
<artifactId>spring-boot-starter-test</artifactId> | ||
</dependency> | ||
</dependencies> | ||
|
||
</project> |
127 changes: 127 additions & 0 deletions
127
jchunk-recursive-character/src/main/java/jchunk/chunker/recursive/Config.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
package jchunk.chunker.recursive; | ||
|
||
import org.springframework.util.Assert; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
/**
 * {@link Config} holds the settings used by the {@link RecursiveCharacterChunker}:
 * the chunk size, the chunk overlap, the ordered list of delimiters, where a delimiter
 * is kept relative to a chunk, and whether whitespace is trimmed.
 * <p>
 * Instances are created through {@link #builder()} or {@link #defaultConfig()}. Every
 * instance owns a private copy of its delimiter list, so later changes to the list that
 * was handed to the builder (or to the builder itself) do not affect it.
 *
 * @author Pablo Sanchidrian Herrera
 */
public class Config {

    private final Integer chunkSize;

    private final Integer chunkOverlap;

    private final List<String> delimiters;

    private final Delimiter keepDelimiter;

    private final Boolean trimWhitespace;

    /**
     * {@return the configured chunk size}
     */
    public Integer getChunkSize() {
        return chunkSize;
    }

    /**
     * {@return the configured chunk overlap}
     */
    public Integer getChunkOverlap() {
        return chunkOverlap;
    }

    /**
     * {@return the delimiters held by this config, in the order they were supplied}
     */
    public List<String> getDelimiters() {
        return delimiters;
    }

    /**
     * {@return where the delimiter is kept relative to a chunk}
     */
    public Delimiter getKeepDelimiter() {
        return keepDelimiter;
    }

    /**
     * {@return whether whitespace trimming is enabled}
     */
    public Boolean getTrimWhitespace() {
        return trimWhitespace;
    }

    private Config(Integer chunkSize, Integer chunkOverlap, List<String> delimiters, Delimiter keepDelimiter,
            Boolean trimWhitespace) {
        this.chunkSize = chunkSize;
        this.chunkOverlap = chunkOverlap;
        // Copy so that this config never shares its list with the builder or with
        // other configs produced by the same builder.
        this.delimiters = new ArrayList<>(delimiters);
        this.keepDelimiter = keepDelimiter;
        this.trimWhitespace = trimWhitespace;
    }

    /**
     * {@return the default config}
     */
    public static Config defaultConfig() {
        return builder().build();
    }

    /**
     * {@return a new builder pre-populated with the default values}
     */
    public static Builder builder() {
        return new Builder();
    }

    /**
     * Fluent builder for {@link Config}.
     * <p>
     * Defaults: chunk size {@code 100}, chunk overlap {@code 20}, delimiters
     * {@code "\n\n", "\n", " ", ""}, {@link Delimiter#START} and whitespace trimming
     * enabled. Every setter rejects invalid or {@code null} input with an
     * {@link IllegalArgumentException}.
     */
    public static class Builder {

        private Integer chunkSize = 100;

        private Integer chunkOverlap = 20;

        private List<String> delimiters = new ArrayList<>(List.of("\n\n", "\n", " ", ""));

        private Delimiter keepDelimiter = Delimiter.START;

        private Boolean trimWhitespace = true;

        /**
         * Sets the chunk size.
         * @param chunkSize the chunk size, must be non-null and greater than 0
         * @return this builder
         * @throws IllegalArgumentException if {@code chunkSize} is null or not positive
         */
        public Builder chunkSize(Integer chunkSize) {
            require(chunkSize != null && chunkSize > 0, "Chunk size must be greater than 0");
            this.chunkSize = chunkSize;
            return this;
        }

        /**
         * Sets the chunk overlap.
         * @param chunkOverlap the overlap, must be non-null and greater than or equal to 0
         * @return this builder
         * @throws IllegalArgumentException if {@code chunkOverlap} is null or negative
         */
        public Builder chunkOverlap(Integer chunkOverlap) {
            require(chunkOverlap != null && chunkOverlap >= 0, "Chunk overlap must be greater than or equal to 0");
            this.chunkOverlap = chunkOverlap;
            return this;
        }

        /**
         * Sets the delimiters. The list is copied, so later changes to the argument
         * are not observed by this builder.
         * @param delimiters the delimiters, must be non-null and free of null elements
         * @return this builder
         * @throws IllegalArgumentException if the list or any of its elements is null
         */
        public Builder separators(List<String> delimiters) {
            require(delimiters != null, "Delimiters must not be null");
            // Explicit loop: List.of(...) style lists throw on contains(null).
            for (String delimiter : delimiters) {
                require(delimiter != null, "Delimiters must not contain null elements");
            }
            this.delimiters = new ArrayList<>(delimiters);
            return this;
        }

        /**
         * Sets where the delimiter is kept relative to a chunk.
         * @param keepDelimiter the delimiter placement, must be non-null
         * @return this builder
         * @throws IllegalArgumentException if {@code keepDelimiter} is null
         */
        public Builder keepDelimiter(Delimiter keepDelimiter) {
            require(keepDelimiter != null, "Keep delimiter must not be null");
            this.keepDelimiter = keepDelimiter;
            return this;
        }

        /**
         * Enables or disables whitespace trimming.
         * @param trimWhitespace the flag, must be non-null
         * @return this builder
         * @throws IllegalArgumentException if {@code trimWhitespace} is null
         */
        public Builder trimWhitespace(Boolean trimWhitespace) {
            require(trimWhitespace != null, "Trim whitespace must not be null");
            this.trimWhitespace = trimWhitespace;
            return this;
        }

        /**
         * Builds the {@link Config}.
         * @return a new config carrying the current builder values
         * @throws IllegalArgumentException if the chunk size is not strictly greater
         * than the chunk overlap
         */
        public Config build() {
            require(chunkSize > chunkOverlap, "Chunk size must be greater than chunk overlap");
            return new Config(chunkSize, chunkOverlap, delimiters, keepDelimiter, trimWhitespace);
        }

        /**
         * Throws an {@link IllegalArgumentException} carrying {@code message} when
         * {@code condition} is false.
         */
        private static void require(boolean condition, String message) {
            if (!condition) {
                throw new IllegalArgumentException(message);
            }
        }

    }

    /**
     * Enum to represent the delimiter configuration
     * <p>
     * <ul>
     * <li>NONE: No delimiter</li>
     * <li>START: Delimiter at the start of the chunk</li>
     * <li>END: Delimiter at the end of the chunk</li>
     * </ul>
     */
    public enum Delimiter {

        NONE, START, END

    }

}
33 changes: 33 additions & 0 deletions
33
...recursive-character/src/main/java/jchunk/chunker/recursive/RecursiveCharacterChunker.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
package jchunk.chunker.recursive; | ||
|
||
import jchunk.chunker.core.chunk.Chunk; | ||
import jchunk.chunker.core.chunk.IChunker; | ||
|
||
import java.util.List; | ||
import java.util.concurrent.atomic.AtomicInteger; | ||
|
||
/** | ||
* {@link RecursiveCharacterChunker} is a class that implements the {@link IChunker} | ||
* interface and splits a text into chunks recursively with the given separators. | ||
* | ||
* @author Pablo Sanchidrian Herrera | ||
*/ | ||
public class RecursiveCharacterChunker implements IChunker { | ||
|
||
private final Config config; | ||
|
||
public RecursiveCharacterChunker() { | ||
this(Config.defaultConfig()); | ||
} | ||
|
||
public RecursiveCharacterChunker(Config config) { | ||
this.config = config; | ||
} | ||
|
||
@Override | ||
public List<Chunk> split(String content) { | ||
return Utils.splitContent(content, config.getChunkSize(), config.getChunkOverlap(), config.getKeepDelimiter(), | ||
config.getDelimiters(), config.getTrimWhitespace(), new AtomicInteger(0)); | ||
} | ||
|
||
} |
Oops, something went wrong.