Skip to content

Commit

Permalink
Merge pull request #20 from PabloSanchi/dev/pablosanchi/fixed-chunker…
Browse files Browse the repository at this point in the history
…-bug

fix: fixed-chunker bug
  • Loading branch information
PabloSanchi authored Dec 24, 2024
2 parents 307911c + 03d3457 commit 6c72aa0
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 11 deletions.
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ The chunk size is the number of characters each chunk will contain. For example,
**Example:**
- Input Text: "This is an example of character splitting."
- Chunk Size: 10
- Output Chunks: `["This is an", " example o", "f characte", "r splitti", "ng."]`
- Output Chunks: `["This is an", " example o", "f characte", "r splittin", "g."]`

### 2. Chunk Overlap
Chunk overlap is the number of characters shared between consecutive chunks. It helps maintain context across chunks by repeating a portion of the text at the end of one chunk at the beginning of the next chunk.
Expand All @@ -61,13 +61,14 @@ Chunk overlap refers to the number of characters that will overlap between conse
- Input Text: "This is an example of character splitting."
- Chunk Size: 10
- Chunk Overlap: 4
- Output Chunks: `["This is an", " an examp", "mple of ch", "aracter sp", " splitting."]`
- Output Chunks: `["This is an", "s an examp", "xample of", "of charac", "aracter sp", "r splittin", "tting."]`

### 3. Separators
Separators are specific character sequences used to split the text. For instance, you might want to split your text at every comma or period.

**Example:**
- Input Text: "This is an example. Let's split on periods. Okay?"
- Chunk Size: 20
- Separator: ". "
- Output Chunks: `["This is an example", "Let's split on periods", "Okay?"]`

Expand Down
3 changes: 2 additions & 1 deletion jchunk-fixed/src/main/java/jchunk/chunker/fixed/Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import java.util.stream.IntStream;

public class Utils {
Expand Down Expand Up @@ -49,7 +50,7 @@ public static List<String> splitIntoSentences(String content, Config config) {
private static List<String> splitWithDelimiter(String content, String delimiter, Config.Delimiter keepDelimiter) {

if (keepDelimiter == Config.Delimiter.NONE) {
return Arrays.stream(content.split(delimiter)).filter(s -> !s.isBlank()).toList();
return Arrays.stream(content.split(Pattern.quote(delimiter))).filter(s -> !s.isBlank()).toList();
}

String withDelimiter = "((?<=%1$s)|(?=%1$s))";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,26 +13,49 @@ class FixedChunkerIT {

private static final String CONTENT = "This is the text I would like to chunk up. It is the example text for this exercise";

// @formatter:off

@Test
void testSplitWithDefaultConfig() {
chunker = new FixedChunker();
List<Chunk> expectedChunks = List
.of(new Chunk(0, "This is the text I would like to chunk up. It is the example text for this exercise"));
List<Chunk> expectedChunks = List.of(
new Chunk(0, "This is the text I would like to chunk up. It is the example text for this exercise")
);

List<Chunk> chunks = chunker.split(CONTENT);

assertThat(chunks).isNotNull().hasSize(1);

}

/**
 * Splits a three-sentence input using "." as the delimiter (chunk size 20,
 * no overlap) and expects exactly three chunks, in order, matching the
 * sentences without their trailing periods.
 */
@Test
void testSplitWithCustomDelimiter() {
    String input = "This is an example. Let's split on periods. Okay?";

    List<Chunk> expected = List.of(
        new Chunk(0, "This is an example"),
        new Chunk(1, "Let's split on periods"),
        new Chunk(2, "Okay?")
    );

    // The "." delimiter is a regex metacharacter; this case exercises it as a literal separator.
    chunker = new FixedChunker(
        Config.builder().chunkSize(20).chunkOverlap(0).delimiter(".").build()
    );

    List<Chunk> actual = chunker.split(input);

    assertThat(actual).isNotNull().hasSize(3).containsExactlyElementsOf(expected);
}

@Test
void testSplitWithCustomConfig() {
Config config = Config.builder().chunkSize(35).chunkOverlap(4).delimiter("").build();

chunker = new FixedChunker(config);

List<Chunk> expectedChunks = List.of(new Chunk(0, "This is the text I would like to ch"),
new Chunk(1, "o chunk up. It is the example text"), new Chunk(2, "ext for this exercise"));
List<Chunk> expectedChunks = List.of(
new Chunk(0, "This is the text I would like to ch"),
new Chunk(1, "o chunk up. It is the example text"),
new Chunk(2, "ext for this exercise")
);

List<Chunk> chunks = chunker.split(CONTENT);

Expand All @@ -45,8 +68,11 @@ void testSplitWithCustomConfigNoWhiteSpace() {

chunker = new FixedChunker(config);

List<Chunk> expectedChunks = List.of(new Chunk(0, "This is the text I would like to ch"),
new Chunk(1, "unk up. It is the example text for "), new Chunk(2, "this exercise"));
List<Chunk> expectedChunks = List.of(
new Chunk(0, "This is the text I would like to ch"),
new Chunk(1, "unk up. It is the example text for "),
new Chunk(2, "this exercise")
);

List<Chunk> chunks = chunker.split(CONTENT);

Expand All @@ -65,12 +91,16 @@ void testSplitWithCustomConfigWithKeepDelimiterSetToNone() {

chunker = new FixedChunker(config);

List<Chunk> expectedChunks = List.of(new Chunk(0, "This is the text I would like to"),
new Chunk(1, "unk up. It is the example text for this exercise"));
List<Chunk> expectedChunks = List.of(
new Chunk(0, "This is the text I would like to"),
new Chunk(1, "unk up. It is the example text for this exercise")
);

List<Chunk> chunks = chunker.split(CONTENT);

assertThat(chunks).isNotNull().hasSize(2).containsExactlyElementsOf(expectedChunks);
}

// @formatter:on

}

0 comments on commit 6c72aa0

Please sign in to comment.