Skip to content

Commit

Permalink
Merge pull request #10 from PabloSanchi/feature/fixed-chunker
Browse files Browse the repository at this point in the history
Feature: add fixed chunker
  • Loading branch information
PabloSanchi authored Aug 7, 2024
2 parents 790884a + 4a8a8c6 commit 89bd0ce
Show file tree
Hide file tree
Showing 9 changed files with 485 additions and 1 deletion.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ For now there is only [Pablo Sanchidrian](https://github.com/PabloSanchi) develo
Feel free to contribute!!

## ROAD MAP
- [ ] Character Chunker (NOT STARTED)
- [x] Fixed Character Chunker (DEVELOPMENT)
- [ ] Recursive Character Text Chunker (NOT STARTED)
- [ ] Document Specific Chunker (NOT STARTED)
- [x] Semantic Chunker (PRE-RELEASE)
Expand Down
42 changes: 42 additions & 0 deletions jchunk-fixed/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven module descriptor for the fixed chunker; inherits from the jchunk parent POM. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>com.github.PabloSanchi</groupId>
        <artifactId>jchunk</artifactId>
        <version>0.0.1-SNAPSHOT</version>
    </parent>

    <artifactId>jchunk-fixed</artifactId>
    <packaging>jar</packaging>
    <name>JChunk - Fixed Chunker</name>
    <description>Fixed Chunker for Java</description>
    <url>https://github.com/PabloSanchi/jchunk</url>

    <scm>
        <url>https://github.com/PabloSanchi/jchunk</url>
        <connection>git://github.com/PabloSanchi/jchunk.git</connection>
        <developerConnection>git@github.com:PabloSanchi/jchunk.git</developerConnection>
    </scm>

    <dependencies>
        <dependency>
            <groupId>com.github.PabloSanchi</groupId>
            <artifactId>jchunk-core</artifactId>
            <version>${project.parent.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot</artifactId>
        </dependency>

        <!-- test -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <!-- Explicit test scope keeps the test stack off the compile classpath of consumers. -->
            <scope>test</scope>
        </dependency>
    </dependencies>

</project>
114 changes: 114 additions & 0 deletions jchunk-fixed/src/main/java/jchunk/chunker/fixed/Config.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
package jchunk.chunker.fixed;

/**
 * Immutable configuration for the fixed chunker.
 *
 * <p>
 * Instances are normally created through {@link #builder()}, which supplies defaults
 * for every option and checks that the chunk size is greater than the chunk overlap.
 * The public constructor stores its arguments as given and performs no validation.
 *
 * @author Pablo Sanchidrian Herrera
 */
public class Config {

    private final Integer chunkSize;

    private final Integer chunkOverlap;

    private final String delimiter;

    private final Boolean trimWhitespace;

    private final Delimiter keepDelimiter;

    /**
     * {@return the configured chunk size}
     */
    public Integer getChunkSize() {
        return chunkSize;
    }

    /**
     * {@return the configured chunk overlap}
     */
    public Integer getChunkOverlap() {
        return chunkOverlap;
    }

    /**
     * {@return the delimiter used to split the content}
     */
    public String getDelimiter() {
        return delimiter;
    }

    /**
     * {@return whether generated chunks are trimmed}
     */
    public Boolean getTrimWhitespace() {
        return trimWhitespace;
    }

    /**
     * {@return where, if anywhere, the delimiter is kept in the split pieces}
     */
    public Delimiter getKeepDelimiter() {
        return keepDelimiter;
    }

    /**
     * Creates a configuration from explicit values without validating them.
     * @param chunkSize the chunk size
     * @param chunkOverlap the chunk overlap
     * @param delimiter the delimiter used to split the content
     * @param trimWhitespace whether generated chunks are trimmed
     * @param keepDelimiter where the delimiter is kept in the split pieces
     */
    public Config(Integer chunkSize, Integer chunkOverlap, String delimiter, Boolean trimWhitespace,
            Delimiter keepDelimiter) {
        this.chunkSize = chunkSize;
        this.chunkOverlap = chunkOverlap;
        this.delimiter = delimiter;
        this.trimWhitespace = trimWhitespace;
        this.keepDelimiter = keepDelimiter;
    }

    /**
     * {@return the default config}
     */
    public static Config defaultConfig() {
        return builder().build();
    }

    /**
     * {@return a new builder pre-populated with the default values}
     */
    public static Builder builder() {
        return new Builder();
    }

    /**
     * Fluent builder for {@link Config}. Defaults: chunk size 1000, chunk overlap 100,
     * delimiter {@code " "}, trimming enabled and {@link Delimiter#NONE}.
     */
    public static class Builder {

        private Integer chunkSize = 1000;

        private Integer chunkOverlap = 100;

        private String delimiter = " ";

        private Boolean trimWhitespace = true;

        private Delimiter keepDelimiter = Delimiter.NONE;

        public Builder chunkSize(Integer chunkSize) {
            this.chunkSize = chunkSize;
            return this;
        }

        public Builder chunkOverlap(Integer chunkOverlap) {
            this.chunkOverlap = chunkOverlap;
            return this;
        }

        public Builder delimiter(String delimiter) {
            this.delimiter = delimiter;
            return this;
        }

        public Builder trimWhitespace(Boolean trimWhitespace) {
            this.trimWhitespace = trimWhitespace;
            return this;
        }

        public Builder keepDelimiter(Delimiter keepDelimiter) {
            this.keepDelimiter = keepDelimiter;
            return this;
        }

        /**
         * Builds the configuration.
         * @return the configuration holding the values set on this builder
         * @throws IllegalArgumentException if the chunk size is not greater than the
         * chunk overlap
         */
        public Config build() {
            // Checked explicitly rather than with `assert`: assertions are skipped unless
            // the JVM runs with -ea, which would let invalid configurations through.
            if (chunkSize <= chunkOverlap) {
                throw new IllegalArgumentException("Chunk size must be greater than chunk overlap");
            }
            return new Config(chunkSize, chunkOverlap, delimiter, trimWhitespace, keepDelimiter);
        }

    }

    /**
     * Enum to represent the delimiter configuration NONE: No delimiter START: Delimiter
     * at the start of the chunk END: Delimiter at the end of the chunk
     */
    public enum Delimiter {

        NONE, START, END

    }

}
31 changes: 31 additions & 0 deletions jchunk-fixed/src/main/java/jchunk/chunker/fixed/FixedChunker.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package jchunk.chunker.fixed;

import jchunk.chunker.core.chunk.Chunk;
import jchunk.chunker.core.chunk.IChunker;

import java.util.List;

/**
 * {@link FixedChunker} is a chunker that splits the content into fixed size chunks.
 *
 * <p>
 * The work is delegated to {@link Utils}: the content is first split into sentences
 * and those sentences are then merged into chunks, both driven by the {@link Config}
 * held by this instance.
 *
 * @author Pablo Sanchidrian Herrera
 */
public class FixedChunker implements IChunker {

    private final Config config;

    /**
     * Creates a chunker that uses {@link Config#defaultConfig()}.
     */
    public FixedChunker() {
        this(Config.defaultConfig());
    }

    /**
     * Creates a chunker that uses the given configuration.
     * @param config configuration applied to every {@link #split(String)} call
     */
    public FixedChunker(Config config) {
        this.config = config;
    }

    /**
     * Splits the content into sentences and merges them into chunks.
     * @param content the content to split
     * @return the chunks produced by {@link Utils#mergeSentences}
     */
    @Override
    public List<Chunk> split(String content) {
        return Utils.mergeSentences(Utils.splitIntoSentences(content, config), config);
    }

}
114 changes: 114 additions & 0 deletions jchunk-fixed/src/main/java/jchunk/chunker/fixed/Utils.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
package jchunk.chunker.fixed;

import jchunk.chunker.core.chunk.Chunk;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Logger;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/**
 * Static helpers used by the fixed chunker: splitting content on a delimiter and
 * merging the resulting pieces ("sentences") into size-bounded chunks.
 */
public class Utils {

    private static final Logger logger = Logger.getLogger(Utils.class.getName());

    public static final String LONGER_THAN_THE_SPECIFIED_ = "Created a chunk of size %d, which is longer than the specified %d";

    /**
     * Splits the content into sentences using the delimiter.
     *
     * <p>
     * An empty delimiter splits the content into single characters. Otherwise the
     * delimiter is passed to {@link String#split(String)} and is therefore interpreted
     * as a regular expression. Depending on {@link Config#getKeepDelimiter()} the
     * delimiter is dropped, kept at the start of the following piece, or kept at the end
     * of the preceding piece. Blank pieces are discarded whenever a delimiter is used.
     * @param content the content to split
     * @param config configuration for the chunker/splitter
     * @return a list of split sentences
     */
    public static List<String> splitIntoSentences(String content, Config config) {
        String delimiter = config.getDelimiter();
        Config.Delimiter keepDelimiter = config.getKeepDelimiter();

        // Only a truly empty delimiter means "split per character". isBlank() would also
        // match whitespace delimiters such as " " (the default) or "\n\n".
        if (delimiter.isEmpty()) {
            return content.chars().mapToObj(c -> String.valueOf((char) c)).collect(Collectors.toList());
        }

        // Splitting on a zero-width lookaround keeps the delimiter attached to the right
        // piece directly. This avoids pairing tokens by index, which failed when the
        // content started or ended with the delimiter or contained consecutive ones.
        String splitRegex;
        if (keepDelimiter == Config.Delimiter.NONE) {
            splitRegex = delimiter;
        }
        else if (keepDelimiter == Config.Delimiter.START) {
            // Cut right before each delimiter so it leads the following piece.
            splitRegex = "(?=" + delimiter + ")";
        }
        else {
            // Cut right after each delimiter so it trails the preceding piece.
            splitRegex = "(?<=" + delimiter + ")";
        }

        return Arrays.stream(content.split(splitRegex)).filter(s -> !s.isBlank()).toList();
    }

    /**
     * Merges the sentences into chunks.
     *
     * <p>
     * Sentences are accumulated, joined by the delimiter, until adding the next one
     * would exceed the chunk size. The accumulated chunk is then emitted, and leading
     * sentences are dropped until the retained length is at most the chunk overlap and
     * the next sentence fits (or nothing is retained). A warning is logged when the
     * accumulated length already exceeds the chunk size.
     * @param sentences the sentences to merge
     * @param config configuration for the chunker/splitter
     * @return list of chunks
     */
    static List<Chunk> mergeSentences(List<String> sentences, Config config) {
        String delimiter = config.getDelimiter();
        int chunkSize = config.getChunkSize();
        int chunkOverlap = config.getChunkOverlap();
        Boolean trimWhitespace = config.getTrimWhitespace();

        int currentLen = 0;
        int delimiterLen = delimiter.length();

        List<Chunk> chunks = new ArrayList<>();
        List<String> currentChunk = new ArrayList<>();

        int chunkIndex = 0;

        for (String sentence : sentences) {
            int sentenceLength = sentence.length();

            if (currentLen + sentenceLength + (currentChunk.isEmpty() ? 0 : delimiterLen) > chunkSize) {
                if (currentLen > chunkSize) {
                    logger.warning(String.format(LONGER_THAN_THE_SPECIFIED_, currentLen, chunkSize));
                }

                if (!currentChunk.isEmpty()) {
                    String generatedSentence = joinSentences(currentChunk, delimiter, trimWhitespace);
                    chunks.add(new Chunk(chunkIndex++, generatedSentence));

                    // Drop leading sentences until only the overlap remains and the
                    // incoming sentence fits within the chunk size.
                    while (currentLen > chunkOverlap
                            || (currentLen + sentenceLength + (currentChunk.isEmpty() ? 0 : delimiterLen) > chunkSize
                                    && currentLen > 0)) {
                        // The delimiter length is only subtracted while sentences remain
                        // after the removal, mirroring how it was added.
                        currentLen -= currentChunk.removeFirst().length() + (currentChunk.isEmpty() ? 0 : delimiterLen);
                    }
                }
            }

            currentChunk.add(sentence);
            currentLen += sentenceLength + (currentChunk.size() > 1 ? delimiterLen : 0);
        }

        // Emit whatever is left after the last sentence.
        if (!currentChunk.isEmpty()) {
            String generatedSentence = joinSentences(currentChunk, delimiter, trimWhitespace);
            chunks.add(new Chunk(chunkIndex++, generatedSentence));
        }

        return chunks;
    }

    /**
     * Joins the sentences with the delimiter, trimming the result when requested.
     * @param sentences the sentences to join
     * @param delimiter the text placed between consecutive sentences
     * @param trimWhitespace whether to trim the joined text
     * @return the joined text
     */
    private static String joinSentences(List<String> sentences, String delimiter, Boolean trimWhitespace) {
        String generatedSentence = String.join(delimiter, sentences);
        return trimWhitespace ? generatedSentence.trim() : generatedSentence;
    }

}
37 changes: 37 additions & 0 deletions jchunk-fixed/src/test/java/jchunk/chunker/fixed/ConfigTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package jchunk.chunker.fixed;

import org.junit.jupiter.api.Test;

import static org.assertj.core.api.Assertions.assertThat;

/**
 * Tests for {@link Config}: the builder defaults and explicitly supplied values.
 */
public class ConfigTest {

    /**
     * A builder left untouched yields the default values.
     */
    @Test
    public void testDefaultConfig() {
        Config defaults = Config.builder().build();

        assertThat(defaults.getChunkSize()).isEqualTo(1000);
        assertThat(defaults.getChunkOverlap()).isEqualTo(100);
        assertThat(defaults.getDelimiter()).isEqualTo(" ");
        assertThat(defaults.getTrimWhitespace()).isTrue();
        assertThat(defaults.getKeepDelimiter()).isEqualTo(Config.Delimiter.NONE);
    }

    /**
     * Every value set on the builder is reflected by the built configuration.
     */
    @Test
    public void testConfigBuilder() {
        Config.Builder customised = Config.builder()
            .chunkSize(35)
            .chunkOverlap(4)
            .delimiter("")
            .trimWhitespace(false)
            .keepDelimiter(Config.Delimiter.START);

        Config custom = customised.build();

        assertThat(custom.getChunkSize()).isEqualTo(35);
        assertThat(custom.getChunkOverlap()).isEqualTo(4);
        assertThat(custom.getDelimiter()).isEqualTo("");
        assertThat(custom.getTrimWhitespace()).isFalse();
        assertThat(custom.getKeepDelimiter()).isEqualTo(Config.Delimiter.START);
    }

}
Loading

0 comments on commit 89bd0ce

Please sign in to comment.