From 4913e3d818f22fe4117dd60e84be5ac285043db9 Mon Sep 17 00:00:00 2001 From: PabloSanchi Date: Wed, 7 Aug 2024 12:25:44 +0200 Subject: [PATCH 01/14] feat: add jchunk fixed chunker module, setup --- jchunk-fixed/pom.xml | 42 ++++++++++++++++++++++++++++++++++++++++++ pom.xml | 1 + 2 files changed, 43 insertions(+) create mode 100644 jchunk-fixed/pom.xml diff --git a/jchunk-fixed/pom.xml b/jchunk-fixed/pom.xml new file mode 100644 index 0000000..bd48d17 --- /dev/null +++ b/jchunk-fixed/pom.xml @@ -0,0 +1,42 @@ + + + 4.0.0 + + com.github.PabloSanchi + jchunk + 0.0.1-SNAPSHOT + + + jchunk-fixed + jar + JChunk - Fixed Chunker + Fixed Chunker for Java + https://github.com/PabloSanchi/jchunk + + + https://github.com/PabloSanchi/jchunk + git://github.com/PabloSanchi/jchunk.git + git@github.com:PabloSanchi/jchunk.git + + + + + com.github.PabloSanchi + jchunk-core + ${project.parent.version} + + + org.springframework.boot + spring-boot + + + + + org.springframework.boot + spring-boot-starter-test + + + + \ No newline at end of file diff --git a/pom.xml b/pom.xml index f8e37ca..86a4ae3 100644 --- a/pom.xml +++ b/pom.xml @@ -16,6 +16,7 @@ jchunk-semantic jchunk-core + jchunk-fixed From 98546ca8a1050df5ae5c1d4427a3a4ef950bad4c Mon Sep 17 00:00:00 2001 From: PabloSanchi Date: Wed, 7 Aug 2024 12:26:31 +0200 Subject: [PATCH 02/14] feat: add fixed chunker tests and basic config --- .../java/jchunk/chunker/fixed/Config.java | 75 +++++++++++++++++++ .../jchunk/chunker/fixed/FixedChunker.java | 30 ++++++++ .../jchunk/chunker/fixed/FixedChunkerIT.java | 46 ++++++++++++ 3 files changed, 151 insertions(+) create mode 100644 jchunk-fixed/src/main/java/jchunk/chunker/fixed/Config.java create mode 100644 jchunk-fixed/src/main/java/jchunk/chunker/fixed/FixedChunker.java create mode 100644 jchunk-fixed/src/test/java/jchunk/chunker/fixed/FixedChunkerIT.java diff --git a/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Config.java b/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Config.java new 
file mode 100644 index 0000000..eb74796 --- /dev/null +++ b/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Config.java @@ -0,0 +1,75 @@ +package jchunk.chunker.fixed; + +/** + * Configuration for the fixed chunker + * + * @author Pablo Sanchidrian Herrera + */ +public class Config { + + private Integer chunkSize; + + private Integer chunkOverlap; + + private String separator; + + public Integer getChunkSize() { + return chunkSize; + } + + public Integer getChunkOverlap() { + return chunkOverlap; + } + + public String getSeparator() { + return separator; + } + + public Config(Integer chunkSize, Integer chunkOverlap, String separator) { + this.chunkSize = chunkSize; + this.chunkOverlap = chunkOverlap; + this.separator = separator; + } + + /** + * {@return the default config} + */ + public static Config defaultConfig() { + return builder().build(); + } + + public static Builder builder() { + return new Builder(); + } + + public static class Builder { + + private Integer chunkSize = 1000; + + private Integer chunkOverlap = 100; + + private String separator = " "; + + public Builder chunkSize(Integer chunkSize) { + this.chunkSize = chunkSize; + return this; + } + + public Builder chunkOverlap(Integer chunkOverlap) { + this.chunkOverlap = chunkOverlap; + return this; + } + + public Builder separator(String separator) { + this.separator = separator; + return this; + } + + public Config build() { + assert chunkSize > chunkOverlap : "Chunk size must be greater than chunk overlap"; + return new Config(chunkSize, chunkOverlap, separator); + } + + } + +} diff --git a/jchunk-fixed/src/main/java/jchunk/chunker/fixed/FixedChunker.java b/jchunk-fixed/src/main/java/jchunk/chunker/fixed/FixedChunker.java new file mode 100644 index 0000000..1c7d844 --- /dev/null +++ b/jchunk-fixed/src/main/java/jchunk/chunker/fixed/FixedChunker.java @@ -0,0 +1,30 @@ +package jchunk.chunker.fixed; + +import jchunk.chunker.core.chunk.Chunk; +import jchunk.chunker.core.chunk.IChunker; + +import 
java.util.List; + +/** + * {@link FixedChunker} is a chunker that splits the content into fixed size chunks. + * + * @author Pablo Sanchidrian Herrera + */ +public class FixedChunker implements IChunker { + + private final Config config; + + public FixedChunker() { + this(Config.defaultConfig()); + } + + public FixedChunker(Config config) { + this.config = config; + } + + @Override + public List split(String content) { + return null; + } + +} diff --git a/jchunk-fixed/src/test/java/jchunk/chunker/fixed/FixedChunkerIT.java b/jchunk-fixed/src/test/java/jchunk/chunker/fixed/FixedChunkerIT.java new file mode 100644 index 0000000..abc1352 --- /dev/null +++ b/jchunk-fixed/src/test/java/jchunk/chunker/fixed/FixedChunkerIT.java @@ -0,0 +1,46 @@ +package jchunk.chunker.fixed; + +import jchunk.chunker.core.chunk.Chunk; +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +public class FixedChunkerIT { + + private FixedChunker chunker; + + private static String CONTENT = """ + This is the text I would like to chunk up. It is the example text for this exercise + """; + + @Test + public void testSplitWithDefaultConfig() { + chunker = new FixedChunker(); + List expectedChunks = List + .of(new Chunk(0, "This is the text I would like to chunk up. It is the example text for this exercise")); + + List chunks = chunker.split(CONTENT); + + assertThat(chunks).isNotNull(); + assertThat(chunks.size()).isEqualTo(1); + } + + @Test + public void testSplitWithCustomConfig() { + Config config = Config.builder().chunkSize(35).chunkOverlap(4).separator("").build(); + + chunker = new FixedChunker(config); + + List expectedChunks = List.of(new Chunk(0, "This is the text I would like to ch"), + new Chunk(1, "o chunk up. 
It is the example text"), new Chunk(2, "ext for this exercise")); + + List chunks = chunker.split(CONTENT); + + assertThat(chunks).isNotNull(); + assertThat(chunks.size()).isEqualTo(3); + assertThat(chunks).containsExactlyElementsOf(expectedChunks); + } + +} From 044e3ef4fb5f4363fe634e53cf00480085333633 Mon Sep 17 00:00:00 2001 From: PabloSanchi Date: Wed, 7 Aug 2024 12:31:26 +0200 Subject: [PATCH 03/14] feat: add trimWhitespace config attribute --- .../main/java/jchunk/chunker/fixed/Config.java | 18 ++++++++++++++++-- .../jchunk/chunker/fixed/FixedChunkerIT.java | 16 ++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Config.java b/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Config.java index eb74796..eab8e05 100644 --- a/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Config.java +++ b/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Config.java @@ -13,6 +13,8 @@ public class Config { private String separator; + private Boolean trimWhitespace; + public Integer getChunkSize() { return chunkSize; } @@ -25,10 +27,15 @@ public String getSeparator() { return separator; } - public Config(Integer chunkSize, Integer chunkOverlap, String separator) { + public Boolean getTrimWhitespace() { + return trimWhitespace; + } + + public Config(Integer chunkSize, Integer chunkOverlap, String separator, Boolean trimWhitespace) { this.chunkSize = chunkSize; this.chunkOverlap = chunkOverlap; this.separator = separator; + this.trimWhitespace = trimWhitespace; } /** @@ -50,6 +57,8 @@ public static class Builder { private String separator = " "; + private Boolean trimWhitespace = true; + public Builder chunkSize(Integer chunkSize) { this.chunkSize = chunkSize; return this; @@ -65,9 +74,14 @@ public Builder separator(String separator) { return this; } + public Builder trimWhitespace(Boolean trimWhitespace) { + this.trimWhitespace = trimWhitespace; + return this; + } + public Config build() { assert chunkSize > 
chunkOverlap : "Chunk size must be greater than chunk overlap"; - return new Config(chunkSize, chunkOverlap, separator); + return new Config(chunkSize, chunkOverlap, separator, trimWhitespace); } } diff --git a/jchunk-fixed/src/test/java/jchunk/chunker/fixed/FixedChunkerIT.java b/jchunk-fixed/src/test/java/jchunk/chunker/fixed/FixedChunkerIT.java index abc1352..7566777 100644 --- a/jchunk-fixed/src/test/java/jchunk/chunker/fixed/FixedChunkerIT.java +++ b/jchunk-fixed/src/test/java/jchunk/chunker/fixed/FixedChunkerIT.java @@ -43,4 +43,20 @@ public void testSplitWithCustomConfig() { assertThat(chunks).containsExactlyElementsOf(expectedChunks); } + @Test + public void testSplitWithCustomConfigNoWhiteSpace() { + Config config = Config.builder().chunkSize(35).chunkOverlap(4).separator("").trimWhitespace(false).build(); + + chunker = new FixedChunker(config); + + List expectedChunks = List.of(new Chunk(0, "This is the text I would like to ch"), + new Chunk(1, "unk up. It is the example text for "), new Chunk(2, "this exercise")); + + List chunks = chunker.split(CONTENT); + + assertThat(chunks).isNotNull(); + assertThat(chunks.size()).isEqualTo(3); + assertThat(chunks).containsExactlyElementsOf(expectedChunks); + } + } From 64ab301a6347fbada1a142b029a16f0ef4e3b078 Mon Sep 17 00:00:00 2001 From: PabloSanchi Date: Wed, 7 Aug 2024 12:33:48 +0200 Subject: [PATCH 04/14] feat: add Config class unit tests --- .../java/jchunk/chunker/fixed/ConfigTest.java | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 jchunk-fixed/src/test/java/jchunk/chunker/fixed/ConfigTest.java diff --git a/jchunk-fixed/src/test/java/jchunk/chunker/fixed/ConfigTest.java b/jchunk-fixed/src/test/java/jchunk/chunker/fixed/ConfigTest.java new file mode 100644 index 0000000..28633a7 --- /dev/null +++ b/jchunk-fixed/src/test/java/jchunk/chunker/fixed/ConfigTest.java @@ -0,0 +1,29 @@ +package jchunk.chunker.fixed; + +import org.junit.jupiter.api.Test; + +import static 
org.assertj.core.api.Assertions.assertThat; + +public class ConfigTest { + + @Test + public void testDefaultConfig() { + Config config = Config.builder().build(); + + assertThat(config.getChunkSize()).isEqualTo(1000); + assertThat(config.getChunkOverlap()).isEqualTo(100); + assertThat(config.getSeparator()).isEqualTo(" "); + assertThat(config.getTrimWhitespace()).isTrue(); + } + + @Test + public void testConfigBuilder() { + Config config = Config.builder().chunkSize(35).chunkOverlap(4).separator("").trimWhitespace(false).build(); + + assertThat(config.getChunkSize()).isEqualTo(35); + assertThat(config.getChunkOverlap()).isEqualTo(4); + assertThat(config.getSeparator()).isEqualTo(""); + assertThat(config.getTrimWhitespace()).isFalse(); + } + +} From ff86f780a7d8982548560764de8f6748d18b7eb1 Mon Sep 17 00:00:00 2001 From: PabloSanchi Date: Wed, 7 Aug 2024 13:39:03 +0200 Subject: [PATCH 05/14] feat: add keep delimiter config attribute --- .../java/jchunk/chunker/fixed/Config.java | 43 +++++++++++++++---- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Config.java b/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Config.java index eab8e05..c4bb2b8 100644 --- a/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Config.java +++ b/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Config.java @@ -7,13 +7,15 @@ */ public class Config { - private Integer chunkSize; + private final Integer chunkSize; - private Integer chunkOverlap; + private final Integer chunkOverlap; - private String separator; + private final String delimiter; - private Boolean trimWhitespace; + private final Boolean trimWhitespace; + + private final Delimiter keepDelimiter; public Integer getChunkSize() { return chunkSize; @@ -23,19 +25,25 @@ public Integer getChunkOverlap() { return chunkOverlap; } - public String getSeparator() { - return separator; + public String getDelimiter() { + return delimiter; } public Boolean getTrimWhitespace() { return 
trimWhitespace; } - public Config(Integer chunkSize, Integer chunkOverlap, String separator, Boolean trimWhitespace) { + public Delimiter getKeepDelimiter() { + return keepDelimiter; + } + + public Config(Integer chunkSize, Integer chunkOverlap, String delimiter, Boolean trimWhitespace, + Delimiter keepDelimiter) { this.chunkSize = chunkSize; this.chunkOverlap = chunkOverlap; - this.separator = separator; + this.delimiter = delimiter; this.trimWhitespace = trimWhitespace; + this.keepDelimiter = keepDelimiter; } /** @@ -59,6 +67,8 @@ public static class Builder { private Boolean trimWhitespace = true; + private Delimiter keepDelimiter = Delimiter.NONE; + public Builder chunkSize(Integer chunkSize) { this.chunkSize = chunkSize; return this; @@ -79,11 +89,26 @@ public Builder trimWhitespace(Boolean trimWhitespace) { return this; } + public Builder keepDelimiter(Delimiter keepDelimiter) { + this.keepDelimiter = keepDelimiter; + return this; + } + public Config build() { assert chunkSize > chunkOverlap : "Chunk size must be greater than chunk overlap"; - return new Config(chunkSize, chunkOverlap, separator, trimWhitespace); + return new Config(chunkSize, chunkOverlap, separator, trimWhitespace, keepDelimiter); } } + /** + * Enum to represent the delimiter configuration NONE: No delimiter START: Delimiter + * at the start of the chunk END: Delimiter at the end of the chunk + */ + public enum Delimiter { + + NONE, START, END + + } + } From ace30fae11148e5f0b0c6c42fa592104d04502ef Mon Sep 17 00:00:00 2001 From: PabloSanchi Date: Wed, 7 Aug 2024 13:39:16 +0200 Subject: [PATCH 06/14] feat: update tests --- .../test/java/jchunk/chunker/fixed/ConfigTest.java | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/jchunk-fixed/src/test/java/jchunk/chunker/fixed/ConfigTest.java b/jchunk-fixed/src/test/java/jchunk/chunker/fixed/ConfigTest.java index 28633a7..4627242 100644 --- a/jchunk-fixed/src/test/java/jchunk/chunker/fixed/ConfigTest.java +++ 
b/jchunk-fixed/src/test/java/jchunk/chunker/fixed/ConfigTest.java @@ -12,18 +12,26 @@ public void testDefaultConfig() { assertThat(config.getChunkSize()).isEqualTo(1000); assertThat(config.getChunkOverlap()).isEqualTo(100); - assertThat(config.getSeparator()).isEqualTo(" "); + assertThat(config.getDelimiter()).isEqualTo(" "); assertThat(config.getTrimWhitespace()).isTrue(); + assertThat(config.getKeepDelimiter()).isEqualTo(Config.Delimiter.NONE); } @Test public void testConfigBuilder() { - Config config = Config.builder().chunkSize(35).chunkOverlap(4).separator("").trimWhitespace(false).build(); + Config config = Config.builder() + .chunkSize(35) + .chunkOverlap(4) + .separator("") + .trimWhitespace(false) + .keepDelimiter(Config.Delimiter.START) + .build(); assertThat(config.getChunkSize()).isEqualTo(35); assertThat(config.getChunkOverlap()).isEqualTo(4); - assertThat(config.getSeparator()).isEqualTo(""); + assertThat(config.getDelimiter()).isEqualTo(""); assertThat(config.getTrimWhitespace()).isFalse(); + assertThat(config.getKeepDelimiter()).isEqualTo(Config.Delimiter.START); } } From 4e829c845bb97ba22317f00faba0b7eb60ce822e Mon Sep 17 00:00:00 2001 From: PabloSanchi Date: Wed, 7 Aug 2024 17:41:33 +0200 Subject: [PATCH 07/14] feat: add fixed chunker utils --- .../main/java/jchunk/chunker/fixed/Utils.java | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 jchunk-fixed/src/main/java/jchunk/chunker/fixed/Utils.java diff --git a/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Utils.java b/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Utils.java new file mode 100644 index 0000000..7f58e24 --- /dev/null +++ b/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Utils.java @@ -0,0 +1,53 @@ +package jchunk.chunker.fixed; + +import jchunk.chunker.core.chunk.Chunk; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.stream.IntStream; + +public class Utils { + + /** + * Splits the content into sentences 
using the delimiter. + * @param content the content to split + * @return a list of split sentences + */ + public static List splitIntoSentences(String content, String delimiter, Config.Delimiter keepDelimiter) { + + if (keepDelimiter != Config.Delimiter.NONE) { + String withDelimiter = "((?<=%1$s)|(?=%1$s))"; + List preSplits = new ArrayList<>(List.of(content.split(String.format(withDelimiter, delimiter)))); + List splits = new ArrayList<>(); + + if (keepDelimiter == Config.Delimiter.START) { + splits.add(preSplits.getFirst()); + IntStream.range(1, preSplits.size()) + .filter(i -> i % 2 == 1) + .forEach(i -> splits.add(preSplits.get(i).concat(preSplits.get(i + 1)))); + } + else { + IntStream.range(0, preSplits.size() - 1) + .filter(i -> i % 2 == 0) + .forEach(i -> splits.add(preSplits.get(i).concat(preSplits.get(i + 1)))); + splits.add(preSplits.getLast()); + } + + return splits.stream().filter(s -> !s.isBlank()).map(String::trim).toList(); + } + + return Arrays.stream(content.split(delimiter)).filter(s -> !s.isBlank()).map(String::trim).toList(); + } + + /** + * Merges the sentences into chunks. 
+ * @param sentences the sentences to merge + * @param delimiter the delimiter to use + * @return list of chunks + */ + List mergeSentences(List sentences, String delimiter) { + return null; + } + +} From ac7c29cc9181006b8f004c29034fc7ad51ce98e1 Mon Sep 17 00:00:00 2001 From: PabloSanchi Date: Wed, 7 Aug 2024 18:31:42 +0200 Subject: [PATCH 08/14] fix: issue with expected chunks size --- .../jchunk/chunker/fixed/FixedChunkerIT.java | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/jchunk-fixed/src/test/java/jchunk/chunker/fixed/FixedChunkerIT.java b/jchunk-fixed/src/test/java/jchunk/chunker/fixed/FixedChunkerIT.java index 7566777..ec3343a 100644 --- a/jchunk-fixed/src/test/java/jchunk/chunker/fixed/FixedChunkerIT.java +++ b/jchunk-fixed/src/test/java/jchunk/chunker/fixed/FixedChunkerIT.java @@ -11,9 +11,7 @@ public class FixedChunkerIT { private FixedChunker chunker; - private static String CONTENT = """ - This is the text I would like to chunk up. It is the example text for this exercise - """; + private static final String CONTENT = "This is the text I would like to chunk up. 
It is the example text for this exercise"; @Test public void testSplitWithDefaultConfig() { @@ -45,7 +43,7 @@ public void testSplitWithCustomConfig() { @Test public void testSplitWithCustomConfigNoWhiteSpace() { - Config config = Config.builder().chunkSize(35).chunkOverlap(4).separator("").trimWhitespace(false).build(); + Config config = Config.builder().chunkSize(35).chunkOverlap(0).separator("").trimWhitespace(false).build(); chunker = new FixedChunker(config); @@ -59,4 +57,26 @@ public void testSplitWithCustomConfigNoWhiteSpace() { assertThat(chunks).containsExactlyElementsOf(expectedChunks); } + @Test + public void testSplitWithCustomConfigWithKeepDelimiterSetToNone() { + Config config = Config.builder() + .chunkSize(35) + .chunkOverlap(0) + .separator("ch") + .trimWhitespace(true) + .keepDelimiter(Config.Delimiter.NONE) + .build(); + + chunker = new FixedChunker(config); + + List expectedChunks = List.of(new Chunk(0, "This is the text I would like to"), + new Chunk(1, "unk up. It is the example text for this exercise")); + + List chunks = chunker.split(CONTENT); + + assertThat(chunks).isNotNull(); + assertThat(chunks.size()).isEqualTo(2); + assertThat(chunks).containsExactlyElementsOf(expectedChunks); + } + } From 8f97f992e81a165f055e6eef48efbe1e45a1e266 Mon Sep 17 00:00:00 2001 From: PabloSanchi Date: Wed, 7 Aug 2024 18:32:17 +0200 Subject: [PATCH 09/14] feat: merge sentences utility --- .../main/java/jchunk/chunker/fixed/Utils.java | 77 +++++++++++++++++-- 1 file changed, 71 insertions(+), 6 deletions(-) diff --git a/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Utils.java b/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Utils.java index 7f58e24..4113fe1 100644 --- a/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Utils.java +++ b/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Utils.java @@ -5,16 +5,32 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; +import 
java.util.logging.Logger; +import java.util.stream.Collectors; import java.util.stream.IntStream; public class Utils { + private static final Logger logger = Logger.getLogger(Utils.class.getName()); + + public static final String LONGER_THAN_THE_SPECIFIED_ = "Created a chunk of size %d, which is longer than the specified %d"; + /** * Splits the content into sentences using the delimiter. * @param content the content to split + * @param config configuration for the chunker/splitter * @return a list of split sentences */ - public static List splitIntoSentences(String content, String delimiter, Config.Delimiter keepDelimiter) { + public static List splitIntoSentences(String content, Config config) { + String delimiter = config.getDelimiter(); + Config.Delimiter keepDelimiter = config.getKeepDelimiter(); + + if(delimiter.isBlank()) { + return content.chars() + .mapToObj(c -> String.valueOf((char) c)) + .collect(Collectors.toList()); + } if (keepDelimiter != Config.Delimiter.NONE) { String withDelimiter = "((?<=%1$s)|(?=%1$s))"; @@ -34,20 +50,69 @@ public static List splitIntoSentences(String content, String delimiter, splits.add(preSplits.getLast()); } - return splits.stream().filter(s -> !s.isBlank()).map(String::trim).toList(); + return splits.stream().filter(s -> !s.isBlank()).toList(); } - return Arrays.stream(content.split(delimiter)).filter(s -> !s.isBlank()).map(String::trim).toList(); + return Arrays.stream(content.split(delimiter)).filter(s -> !s.isBlank()).toList(); } /** * Merges the sentences into chunks. 
* @param sentences the sentences to merge - * @param delimiter the delimiter to use + * @param config configuration for the chunker/splitter * @return list of chunks */ - List mergeSentences(List sentences, String delimiter) { - return null; + static List mergeSentences(List sentences, Config config) { + String delimiter = config.getDelimiter(); + Integer chunkSize = config.getChunkSize(); + Integer chunkOverlap = config.getChunkOverlap(); + Boolean trimWhitespace = config.getTrimWhitespace(); + + int currentLen = 0; + int delimiterLen = delimiter.length(); + + List chunks = new ArrayList<>(); + List currentChunk = new ArrayList<>(); + + AtomicInteger chunkIndex = new AtomicInteger(0); + + + for (String sentence : sentences) { + int sentenceLength = sentence.length(); + + if (currentLen + sentenceLength + (currentChunk.isEmpty() ? 0 : delimiterLen) > chunkSize) { + if (currentLen > chunkSize) { + logger.warning(String.format(LONGER_THAN_THE_SPECIFIED_, currentLen, config.getChunkSize())); + } + + if (!currentChunk.isEmpty()) { + String generatedSentence = joinSentences(currentChunk, delimiter, trimWhitespace); + chunks.add(new Chunk(chunkIndex.getAndIncrement(), generatedSentence)); + + while ( + currentLen > chunkOverlap || + (currentLen + sentenceLength + (currentChunk.isEmpty() ? 0 : delimiterLen) > chunkSize && currentLen > 0) + ) { + currentLen -= currentChunk.removeFirst().length() + (currentChunk.isEmpty() ? 0 : delimiterLen); + } + } + } + + currentChunk.add(sentence); + currentLen += sentenceLength + (currentChunk.size() > 1 ? 
delimiterLen : 0); + } + + if (!currentChunk.isEmpty()) { + String generatedSentence = joinSentences(currentChunk, config.getDelimiter(), config.getTrimWhitespace()); + chunks.add(new Chunk(chunkIndex.getAndIncrement(), generatedSentence)); + } + + return chunks; + } + + private static String joinSentences(List sentences, String delimiter, Boolean trimWhitespace) { + String generatedSentence = String.join(delimiter, sentences); + return trimWhitespace ? generatedSentence.trim() : generatedSentence; } } From 242be893dc22195d0e473803e855f3d41d77d664 Mon Sep 17 00:00:00 2001 From: PabloSanchi Date: Wed, 7 Aug 2024 18:32:45 +0200 Subject: [PATCH 10/14] feat: implement split method --- .../src/main/java/jchunk/chunker/fixed/FixedChunker.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/jchunk-fixed/src/main/java/jchunk/chunker/fixed/FixedChunker.java b/jchunk-fixed/src/main/java/jchunk/chunker/fixed/FixedChunker.java index 1c7d844..e4a9186 100644 --- a/jchunk-fixed/src/main/java/jchunk/chunker/fixed/FixedChunker.java +++ b/jchunk-fixed/src/main/java/jchunk/chunker/fixed/FixedChunker.java @@ -24,7 +24,8 @@ public FixedChunker(Config config) { @Override public List split(String content) { - return null; + List sentences = Utils.splitIntoSentences(content, config); + return Utils.mergeSentences(sentences, config); } } From 771057e30f97032b57996c4adc68887439fecc9f Mon Sep 17 00:00:00 2001 From: PabloSanchi Date: Wed, 7 Aug 2024 18:34:53 +0200 Subject: [PATCH 11/14] feat: update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bae1a39..999bda0 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ For now there is only [Pablo Sanchidrian](https://github.com/PabloSanchi) develo Feel free to contribute!! 
## ROAD MAP -- [ ] Character Chunker (NOT STARTED) +- [x] Fixed Character Chunker (DEVELOPMENT) - [ ] Recursive Character Text Chunker (NOT STARTED) - [ ] Document Specific Chunker (NOT STARTED) - [x] Semantic Chunker (PRE-RELEASE) From 1beb4e87c1661ead34bd034eb50ced009db165f0 Mon Sep 17 00:00:00 2001 From: PabloSanchi Date: Wed, 7 Aug 2024 18:36:47 +0200 Subject: [PATCH 12/14] fix: lint --- .../src/main/java/jchunk/chunker/fixed/Utils.java | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Utils.java b/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Utils.java index 4113fe1..e2f50b4 100644 --- a/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Utils.java +++ b/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Utils.java @@ -26,10 +26,8 @@ public static List splitIntoSentences(String content, Config config) { String delimiter = config.getDelimiter(); Config.Delimiter keepDelimiter = config.getKeepDelimiter(); - if(delimiter.isBlank()) { - return content.chars() - .mapToObj(c -> String.valueOf((char) c)) - .collect(Collectors.toList()); + if (delimiter.isBlank()) { + return content.chars().mapToObj(c -> String.valueOf((char) c)).collect(Collectors.toList()); } if (keepDelimiter != Config.Delimiter.NONE) { @@ -76,7 +74,6 @@ static List mergeSentences(List sentences, Config config) { AtomicInteger chunkIndex = new AtomicInteger(0); - for (String sentence : sentences) { int sentenceLength = sentence.length(); @@ -89,10 +86,9 @@ static List mergeSentences(List sentences, Config config) { String generatedSentence = joinSentences(currentChunk, delimiter, trimWhitespace); chunks.add(new Chunk(chunkIndex.getAndIncrement(), generatedSentence)); - while ( - currentLen > chunkOverlap || - (currentLen + sentenceLength + (currentChunk.isEmpty() ? 0 : delimiterLen) > chunkSize && currentLen > 0) - ) { + while (currentLen > chunkOverlap + || (currentLen + sentenceLength + (currentChunk.isEmpty() ? 
0 : delimiterLen) > chunkSize + && currentLen > 0)) { currentLen -= currentChunk.removeFirst().length() + (currentChunk.isEmpty() ? 0 : delimiterLen); } } From 57f71ff90767f76f035cfc00071f54ac28d45956 Mon Sep 17 00:00:00 2001 From: PabloSanchi Date: Wed, 7 Aug 2024 18:44:30 +0200 Subject: [PATCH 13/14] feat: rename separator to delimiter --- .../src/main/java/jchunk/chunker/fixed/Config.java | 8 ++++---- .../src/test/java/jchunk/chunker/fixed/ConfigTest.java | 2 +- .../test/java/jchunk/chunker/fixed/FixedChunkerIT.java | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Config.java b/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Config.java index c4bb2b8..ae69638 100644 --- a/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Config.java +++ b/jchunk-fixed/src/main/java/jchunk/chunker/fixed/Config.java @@ -63,7 +63,7 @@ public static class Builder { private Integer chunkOverlap = 100; - private String separator = " "; + private String delimiter = " "; private Boolean trimWhitespace = true; @@ -79,8 +79,8 @@ public Builder chunkOverlap(Integer chunkOverlap) { return this; } - public Builder separator(String separator) { - this.separator = separator; + public Builder delimiter(String delimiter) { + this.delimiter = delimiter; return this; } @@ -96,7 +96,7 @@ public Builder keepDelimiter(Delimiter keepDelimiter) { public Config build() { assert chunkSize > chunkOverlap : "Chunk size must be greater than chunk overlap"; - return new Config(chunkSize, chunkOverlap, separator, trimWhitespace, keepDelimiter); + return new Config(chunkSize, chunkOverlap, delimiter, trimWhitespace, keepDelimiter); } } diff --git a/jchunk-fixed/src/test/java/jchunk/chunker/fixed/ConfigTest.java b/jchunk-fixed/src/test/java/jchunk/chunker/fixed/ConfigTest.java index 4627242..5aad9a1 100644 --- a/jchunk-fixed/src/test/java/jchunk/chunker/fixed/ConfigTest.java +++ 
b/jchunk-fixed/src/test/java/jchunk/chunker/fixed/ConfigTest.java @@ -22,7 +22,7 @@ public void testConfigBuilder() { Config config = Config.builder() .chunkSize(35) .chunkOverlap(4) - .separator("") + .delimiter("") .trimWhitespace(false) .keepDelimiter(Config.Delimiter.START) .build(); diff --git a/jchunk-fixed/src/test/java/jchunk/chunker/fixed/FixedChunkerIT.java b/jchunk-fixed/src/test/java/jchunk/chunker/fixed/FixedChunkerIT.java index ec3343a..7ac1733 100644 --- a/jchunk-fixed/src/test/java/jchunk/chunker/fixed/FixedChunkerIT.java +++ b/jchunk-fixed/src/test/java/jchunk/chunker/fixed/FixedChunkerIT.java @@ -27,7 +27,7 @@ public void testSplitWithDefaultConfig() { @Test public void testSplitWithCustomConfig() { - Config config = Config.builder().chunkSize(35).chunkOverlap(4).separator("").build(); + Config config = Config.builder().chunkSize(35).chunkOverlap(4).delimiter("").build(); chunker = new FixedChunker(config); @@ -43,7 +43,7 @@ public void testSplitWithCustomConfig() { @Test public void testSplitWithCustomConfigNoWhiteSpace() { - Config config = Config.builder().chunkSize(35).chunkOverlap(0).separator("").trimWhitespace(false).build(); + Config config = Config.builder().chunkSize(35).chunkOverlap(0).delimiter("").trimWhitespace(false).build(); chunker = new FixedChunker(config); @@ -62,7 +62,7 @@ public void testSplitWithCustomConfigWithKeepDelimiterSetToNone() { Config config = Config.builder() .chunkSize(35) .chunkOverlap(0) - .separator("ch") + .delimiter("ch") .trimWhitespace(true) .keepDelimiter(Config.Delimiter.NONE) .build(); From 4a8a8c667b48ff6b14f7a7bb0f9bbaabae1f8eb3 Mon Sep 17 00:00:00 2001 From: PabloSanchi Date: Wed, 7 Aug 2024 18:48:25 +0200 Subject: [PATCH 14/14] feat: add utils unit tests --- .../java/jchunk/chunker/fixed/UtilsTest.java | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 jchunk-fixed/src/test/java/jchunk/chunker/fixed/UtilsTest.java diff --git 
a/jchunk-fixed/src/test/java/jchunk/chunker/fixed/UtilsTest.java b/jchunk-fixed/src/test/java/jchunk/chunker/fixed/UtilsTest.java new file mode 100644 index 0000000..83a0b9a --- /dev/null +++ b/jchunk-fixed/src/test/java/jchunk/chunker/fixed/UtilsTest.java @@ -0,0 +1,63 @@ +package jchunk.chunker.fixed; + +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +public class UtilsTest { + + private static final String CONTENT = "This is the text I would like to chunk up. It is the example text for this exercise"; + + @Test + public void testSplitIntoSentencesWithBlanckSeparator() { + Config config = Config.builder().delimiter("").build(); + + List sentences = Utils.splitIntoSentences(CONTENT, config); + + assertThat(sentences).isNotNull(); + assertThat(sentences).hasSize(CONTENT.length()); + + for (int i = 0; i < CONTENT.length(); i++) { + assertThat(sentences.get(i)).isEqualTo(String.valueOf(CONTENT.charAt(i))); + } + } + + @Test + public void testSplitIntoSentencesWithNoDelimiter() { + Config config = Config.builder().delimiter("ch").build(); + + List sentences = Utils.splitIntoSentences(CONTENT, config); + + assertThat(sentences).isNotNull(); + assertThat(sentences).hasSize(2); + assertThat(sentences.getFirst()).isEqualTo("This is the text I would like to "); + assertThat(sentences.getLast()).isEqualTo("unk up. It is the example text for this exercise"); + } + + @Test + public void testSplitIntoSentencesWithDelimiterStart() { + Config config = Config.builder().delimiter("ch").keepDelimiter(Config.Delimiter.START).build(); + + List sentences = Utils.splitIntoSentences(CONTENT, config); + + assertThat(sentences).isNotNull(); + assertThat(sentences).hasSize(2); + assertThat(sentences.getFirst()).isEqualTo("This is the text I would like to "); + assertThat(sentences.getLast()).isEqualTo("chunk up. 
It is the example text for this exercise"); + } + + @Test + public void testSplitIntoSentencesWithDelimiterEnd() { + Config config = Config.builder().delimiter("ch").keepDelimiter(Config.Delimiter.END).build(); + + List sentences = Utils.splitIntoSentences(CONTENT, config); + + assertThat(sentences).isNotNull(); + assertThat(sentences).hasSize(2); + assertThat(sentences.getFirst()).isEqualTo("This is the text I would like to ch"); + assertThat(sentences.getLast()).isEqualTo("unk up. It is the example text for this exercise"); + } + +}