-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #10 from PabloSanchi/feature/fixed-chunker
Feature: add fixed chunker
- Loading branch information
Showing
9 changed files
with
485 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven descriptor for the jchunk-fixed module (the fixed chunker). -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>com.github.PabloSanchi</groupId>
        <artifactId>jchunk</artifactId>
        <version>0.0.1-SNAPSHOT</version>
    </parent>

    <artifactId>jchunk-fixed</artifactId>
    <packaging>jar</packaging>
    <name>JChunk - Fixed Chunker</name>
    <description>Fixed Chunker for Java</description>
    <url>https://github.com/PabloSanchi/jchunk</url>

    <!--
        Maven SCM URLs must follow the scm:<provider>:<provider-specific-url> form.
        The read-only connection uses HTTPS because GitHub no longer serves the
        unauthenticated git:// protocol.
    -->
    <scm>
        <url>https://github.com/PabloSanchi/jchunk</url>
        <connection>scm:git:https://github.com/PabloSanchi/jchunk.git</connection>
        <developerConnection>scm:git:git@github.com:PabloSanchi/jchunk.git</developerConnection>
    </scm>

    <dependencies>
        <dependency>
            <groupId>com.github.PabloSanchi</groupId>
            <artifactId>jchunk-core</artifactId>
            <version>${project.parent.version}</version>
        </dependency>
        <!-- No version declared here; presumably managed by the parent POM (confirm). -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot</artifactId>
        </dependency>

        <!-- test -->
        <!-- Scoped to test so the testing stack is not put on the main classpath. -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>

</project>
114 changes: 114 additions & 0 deletions
114
jchunk-fixed/src/main/java/jchunk/chunker/fixed/Config.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
package jchunk.chunker.fixed; | ||
|
||
/**
 * Immutable configuration for the fixed chunker.
 *
 * <p>
 * Instances are normally obtained through {@link #builder()} (or
 * {@link #defaultConfig()}), which validates the size/overlap relation. The public
 * constructor stores its arguments as given and performs no validation.
 *
 * @author Pablo Sanchidrian Herrera
 */
public class Config {

    private final Integer chunkSize;

    private final Integer chunkOverlap;

    private final String delimiter;

    private final Boolean trimWhitespace;

    private final Delimiter keepDelimiter;

    /**
     * {@return the configured chunk size}
     */
    public Integer getChunkSize() {
        return chunkSize;
    }

    /**
     * {@return the configured overlap between consecutive chunks}
     */
    public Integer getChunkOverlap() {
        return chunkOverlap;
    }

    /**
     * {@return the delimiter used to split the content}
     */
    public String getDelimiter() {
        return delimiter;
    }

    /**
     * {@return whether generated chunks are trimmed}
     */
    public Boolean getTrimWhitespace() {
        return trimWhitespace;
    }

    /**
     * {@return where, if at all, the delimiter is kept in the split pieces}
     */
    public Delimiter getKeepDelimiter() {
        return keepDelimiter;
    }

    /**
     * Creates a configuration from explicit values. No validation is applied here;
     * prefer {@link #builder()} to get the size/overlap check.
     * @param chunkSize the chunk size
     * @param chunkOverlap the overlap between consecutive chunks
     * @param delimiter the delimiter used to split the content
     * @param trimWhitespace whether generated chunks are trimmed
     * @param keepDelimiter where the delimiter is kept in the split pieces
     */
    public Config(Integer chunkSize, Integer chunkOverlap, String delimiter, Boolean trimWhitespace,
            Delimiter keepDelimiter) {
        this.chunkSize = chunkSize;
        this.chunkOverlap = chunkOverlap;
        this.delimiter = delimiter;
        this.trimWhitespace = trimWhitespace;
        this.keepDelimiter = keepDelimiter;
    }

    /**
     * {@return the default config}
     */
    public static Config defaultConfig() {
        return builder().build();
    }

    /**
     * {@return a new builder pre-populated with the default values}
     */
    public static Builder builder() {
        return new Builder();
    }

    /**
     * Fluent builder for {@link Config}. Defaults: chunk size 1000, chunk overlap 100,
     * delimiter {@code " "}, trimming enabled and {@link Delimiter#NONE}.
     */
    public static class Builder {

        private Integer chunkSize = 1000;

        private Integer chunkOverlap = 100;

        private String delimiter = " ";

        private Boolean trimWhitespace = true;

        private Delimiter keepDelimiter = Delimiter.NONE;

        public Builder chunkSize(Integer chunkSize) {
            this.chunkSize = chunkSize;
            return this;
        }

        public Builder chunkOverlap(Integer chunkOverlap) {
            this.chunkOverlap = chunkOverlap;
            return this;
        }

        public Builder delimiter(String delimiter) {
            this.delimiter = delimiter;
            return this;
        }

        public Builder trimWhitespace(Boolean trimWhitespace) {
            this.trimWhitespace = trimWhitespace;
            return this;
        }

        public Builder keepDelimiter(Delimiter keepDelimiter) {
            this.keepDelimiter = keepDelimiter;
            return this;
        }

        /**
         * Builds the configuration after validating it.
         * @return the validated configuration
         * @throws IllegalArgumentException if the chunk size or overlap is
         * {@code null}, or if the chunk size is not strictly greater than the overlap
         */
        public Config build() {
            // Explicit checks instead of `assert`: assertions are disabled unless the
            // JVM is started with -ea, so an assert would not protect production code.
            if (chunkSize == null || chunkOverlap == null) {
                throw new IllegalArgumentException("Chunk size and chunk overlap must not be null");
            }
            if (chunkSize <= chunkOverlap) {
                throw new IllegalArgumentException("Chunk size must be greater than chunk overlap");
            }
            return new Config(chunkSize, chunkOverlap, delimiter, trimWhitespace, keepDelimiter);
        }

    }

    /**
     * Where the delimiter is kept in the split pieces.
     * <ul>
     * <li>{@code NONE}: no delimiter</li>
     * <li>{@code START}: delimiter at the start of the chunk</li>
     * <li>{@code END}: delimiter at the end of the chunk</li>
     * </ul>
     */
    public enum Delimiter {

        NONE, START, END

    }

}
31 changes: 31 additions & 0 deletions
31
jchunk-fixed/src/main/java/jchunk/chunker/fixed/FixedChunker.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
package jchunk.chunker.fixed; | ||
|
||
import jchunk.chunker.core.chunk.Chunk; | ||
import jchunk.chunker.core.chunk.IChunker; | ||
|
||
import java.util.List; | ||
|
||
/** | ||
* {@link FixedChunker} is a chunker that splits the content into fixed size chunks. | ||
* | ||
* @author Pablo Sanchidrian Herrera | ||
*/ | ||
public class FixedChunker implements IChunker { | ||
|
||
private final Config config; | ||
|
||
public FixedChunker() { | ||
this(Config.defaultConfig()); | ||
} | ||
|
||
public FixedChunker(Config config) { | ||
this.config = config; | ||
} | ||
|
||
@Override | ||
public List<Chunk> split(String content) { | ||
List<String> sentences = Utils.splitIntoSentences(content, config); | ||
return Utils.mergeSentences(sentences, config); | ||
} | ||
|
||
} |
114 changes: 114 additions & 0 deletions
114
jchunk-fixed/src/main/java/jchunk/chunker/fixed/Utils.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
package jchunk.chunker.fixed; | ||
|
||
import jchunk.chunker.core.chunk.Chunk; | ||
|
||
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
|
||
public class Utils { | ||
|
||
private static final Logger logger = Logger.getLogger(Utils.class.getName()); | ||
|
||
public static final String LONGER_THAN_THE_SPECIFIED_ = "Created a chunk of size %d, which is longer than the specified %d"; | ||
|
||
/** | ||
* Splits the content into sentences using the delimiter. | ||
* @param content the content to split | ||
* @param config configuration for the chunker/splitter | ||
* @return a list of split sentences | ||
*/ | ||
public static List<String> splitIntoSentences(String content, Config config) { | ||
String delimiter = config.getDelimiter(); | ||
Config.Delimiter keepDelimiter = config.getKeepDelimiter(); | ||
|
||
if (delimiter.isBlank()) { | ||
return content.chars().mapToObj(c -> String.valueOf((char) c)).collect(Collectors.toList()); | ||
} | ||
|
||
if (keepDelimiter != Config.Delimiter.NONE) { | ||
String withDelimiter = "((?<=%1$s)|(?=%1$s))"; | ||
List<String> preSplits = new ArrayList<>(List.of(content.split(String.format(withDelimiter, delimiter)))); | ||
List<String> splits = new ArrayList<>(); | ||
|
||
if (keepDelimiter == Config.Delimiter.START) { | ||
splits.add(preSplits.getFirst()); | ||
IntStream.range(1, preSplits.size()) | ||
.filter(i -> i % 2 == 1) | ||
.forEach(i -> splits.add(preSplits.get(i).concat(preSplits.get(i + 1)))); | ||
} | ||
else { | ||
IntStream.range(0, preSplits.size() - 1) | ||
.filter(i -> i % 2 == 0) | ||
.forEach(i -> splits.add(preSplits.get(i).concat(preSplits.get(i + 1)))); | ||
splits.add(preSplits.getLast()); | ||
} | ||
|
||
return splits.stream().filter(s -> !s.isBlank()).toList(); | ||
} | ||
|
||
return Arrays.stream(content.split(delimiter)).filter(s -> !s.isBlank()).toList(); | ||
} | ||
|
||
/** | ||
* Merges the sentences into chunks. | ||
* @param sentences the sentences to merge | ||
* @param config configuration for the chunker/splitter | ||
* @return list of chunks | ||
*/ | ||
static List<Chunk> mergeSentences(List<String> sentences, Config config) { | ||
String delimiter = config.getDelimiter(); | ||
Integer chunkSize = config.getChunkSize(); | ||
Integer chunkOverlap = config.getChunkOverlap(); | ||
Boolean trimWhitespace = config.getTrimWhitespace(); | ||
|
||
int currentLen = 0; | ||
int delimiterLen = delimiter.length(); | ||
|
||
List<Chunk> chunks = new ArrayList<>(); | ||
List<String> currentChunk = new ArrayList<>(); | ||
|
||
AtomicInteger chunkIndex = new AtomicInteger(0); | ||
|
||
for (String sentence : sentences) { | ||
int sentenceLength = sentence.length(); | ||
|
||
if (currentLen + sentenceLength + (currentChunk.isEmpty() ? 0 : delimiterLen) > chunkSize) { | ||
if (currentLen > chunkSize) { | ||
logger.warning(String.format(LONGER_THAN_THE_SPECIFIED_, currentLen, config.getChunkSize())); | ||
} | ||
|
||
if (!currentChunk.isEmpty()) { | ||
String generatedSentence = joinSentences(currentChunk, delimiter, trimWhitespace); | ||
chunks.add(new Chunk(chunkIndex.getAndIncrement(), generatedSentence)); | ||
|
||
while (currentLen > chunkOverlap | ||
|| (currentLen + sentenceLength + (currentChunk.isEmpty() ? 0 : delimiterLen) > chunkSize | ||
&& currentLen > 0)) { | ||
currentLen -= currentChunk.removeFirst().length() + (currentChunk.isEmpty() ? 0 : delimiterLen); | ||
} | ||
} | ||
} | ||
|
||
currentChunk.add(sentence); | ||
currentLen += sentenceLength + (currentChunk.size() > 1 ? delimiterLen : 0); | ||
} | ||
|
||
if (!currentChunk.isEmpty()) { | ||
String generatedSentence = joinSentences(currentChunk, config.getDelimiter(), config.getTrimWhitespace()); | ||
chunks.add(new Chunk(chunkIndex.getAndIncrement(), generatedSentence)); | ||
} | ||
|
||
return chunks; | ||
} | ||
|
||
private static String joinSentences(List<String> sentences, String delimiter, Boolean trimWhitespace) { | ||
String generatedSentence = String.join(delimiter, sentences); | ||
return trimWhitespace ? generatedSentence.trim() : generatedSentence; | ||
} | ||
|
||
} |
37 changes: 37 additions & 0 deletions
37
jchunk-fixed/src/test/java/jchunk/chunker/fixed/ConfigTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
package jchunk.chunker.fixed; | ||
|
||
import org.junit.jupiter.api.Test; | ||
|
||
import static org.assertj.core.api.Assertions.assertThat; | ||
|
||
public class ConfigTest { | ||
|
||
@Test | ||
public void testDefaultConfig() { | ||
Config config = Config.builder().build(); | ||
|
||
assertThat(config.getChunkSize()).isEqualTo(1000); | ||
assertThat(config.getChunkOverlap()).isEqualTo(100); | ||
assertThat(config.getDelimiter()).isEqualTo(" "); | ||
assertThat(config.getTrimWhitespace()).isTrue(); | ||
assertThat(config.getKeepDelimiter()).isEqualTo(Config.Delimiter.NONE); | ||
} | ||
|
||
@Test | ||
public void testConfigBuilder() { | ||
Config config = Config.builder() | ||
.chunkSize(35) | ||
.chunkOverlap(4) | ||
.delimiter("") | ||
.trimWhitespace(false) | ||
.keepDelimiter(Config.Delimiter.START) | ||
.build(); | ||
|
||
assertThat(config.getChunkSize()).isEqualTo(35); | ||
assertThat(config.getChunkOverlap()).isEqualTo(4); | ||
assertThat(config.getDelimiter()).isEqualTo(""); | ||
assertThat(config.getTrimWhitespace()).isFalse(); | ||
assertThat(config.getKeepDelimiter()).isEqualTo(Config.Delimiter.START); | ||
} | ||
|
||
} |
Oops, something went wrong.