Skip to content

Commit

Permalink
Merge pull request #25 from PabloSanchi/dev/pablosanchi/refactor-fixe…
Browse files Browse the repository at this point in the history
…dchunker

refactor: minor code improvements
  • Loading branch information
PabloSanchi authored Dec 25, 2024
2 parents 8757b38 + d303d25 commit b69ed1f
Showing 1 changed file with 41 additions and 19 deletions.
60 changes: 41 additions & 19 deletions jchunk-fixed/src/main/java/jchunk/chunker/fixed/Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@

import jchunk.chunker.core.chunk.Chunk;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Logger;
import java.util.regex.Pattern;
Expand Down Expand Up @@ -50,7 +48,7 @@ public static List<String> splitIntoSentences(String content, Config config) {
private static List<String> splitWithDelimiter(String content, String delimiter, Config.Delimiter keepDelimiter) {

if (keepDelimiter == Config.Delimiter.NONE) {
return Arrays.stream(content.split(Pattern.quote(delimiter))).filter(s -> !s.isBlank()).toList();
return Arrays.stream(content.split(Pattern.quote(delimiter))).filter(s -> !s.isEmpty()).toList();
}

String withDelimiter = "((?<=%1$s)|(?=%1$s))";
Expand All @@ -70,11 +68,11 @@ private static List<String> splitWithDelimiterStart(List<String> preSplits) {
List<String> splits = new ArrayList<>();

splits.add(preSplits.getFirst());
IntStream.range(1, preSplits.size())
IntStream.range(1, preSplits.size() - 1)
.filter(i -> i % 2 == 1)
.forEach(i -> splits.add(preSplits.get(i).concat(preSplits.get(i + 1))));

return splits.stream().filter(s -> !s.isBlank()).toList();
return splits.stream().filter(s -> !s.isEmpty()).toList();
}

/**
Expand All @@ -91,7 +89,7 @@ private static List<String> splitWithDelimiterEnd(List<String> preSplits) {
.forEach(i -> splits.add(preSplits.get(i).concat(preSplits.get(i + 1))));
splits.add(preSplits.getLast());

return splits.stream().filter(s -> !s.isBlank()).toList();
return splits.stream().filter(s -> !s.isEmpty()).toList();
}

/**
Expand All @@ -110,7 +108,7 @@ static List<Chunk> mergeSentences(List<String> sentences, Config config) {
int delimiterLen = delimiter.length();

List<Chunk> chunks = new ArrayList<>();
List<String> currentChunk = new ArrayList<>();
Deque<String> currentChunk = new LinkedList<>();

AtomicInteger chunkIndex = new AtomicInteger(0);

Expand All @@ -123,14 +121,8 @@ static List<Chunk> mergeSentences(List<String> sentences, Config config) {
}

if (!currentChunk.isEmpty()) {
String generatedSentence = joinSentences(currentChunk, delimiter, trimWhitespace);
chunks.add(new Chunk(chunkIndex.getAndIncrement(), generatedSentence));

while (currentLen > chunkOverlap
|| (currentLen + sentenceLength + (currentChunk.isEmpty() ? 0 : delimiterLen) > chunkSize
&& currentLen > 0)) {
currentLen -= currentChunk.removeFirst().length() + (currentChunk.isEmpty() ? 0 : delimiterLen);
}
addChunk(chunks, currentChunk, delimiter, trimWhitespace, chunkIndex);
currentLen = adjustCurrentChunkForOverlap(currentChunk, currentLen, chunkOverlap, delimiterLen);
}
}

Expand All @@ -139,21 +131,51 @@ static List<Chunk> mergeSentences(List<String> sentences, Config config) {
}

if (!currentChunk.isEmpty()) {
String generatedSentence = joinSentences(currentChunk, config.getDelimiter(), config.getTrimWhitespace());
chunks.add(new Chunk(chunkIndex.getAndIncrement(), generatedSentence));
addChunk(chunks, currentChunk, delimiter, trimWhitespace, chunkIndex);
}

return chunks;
}

/**
 * Builds one chunk from the buffered sentences and appends it to the result list.
 * <p>
 * The sentences are joined first (via {@code joinSentences}); only afterwards is the
 * next id taken from {@code index}, so the counter advances exactly once per chunk
 * that is actually produced.
 * @param chunks the list that receives the newly built chunk
 * @param currentChunk the sentences that make up the chunk, in order
 * @param delimiter the delimiter placed between joined sentences
 * @param trimWhitespace whether the joined text is trimmed
 * @param index counter supplying the chunk id; incremented by this call
 */
private static void addChunk(List<Chunk> chunks, Deque<String> currentChunk, String delimiter,
		boolean trimWhitespace, AtomicInteger index) {
	String content = joinSentences(currentChunk, delimiter, trimWhitespace);
	int chunkId = index.getAndIncrement();
	chunks.add(Chunk.builder().id(chunkId).content(content).build());
}

/**
 * Drops sentences from the front of the current chunk until what remains fits
 * within the configured overlap.
 * <p>
 * Each removed sentence reduces the running length by its own length plus one
 * delimiter, except when it was the last sentence left, in which case no delimiter
 * is subtracted. The deque is modified in place.
 * <p>
 * NOTE(review): the loop is driven only by {@code chunkOverlap}; the size of the
 * sentence about to be appended is not a parameter here - confirm the caller does
 * not need that taken into account.
 * @param currentChunk the buffered sentences; leading entries may be removed
 * @param currentLen the length of the buffered sentences including delimiters
 * @param chunkOverlap the maximum length allowed to carry over into the next chunk
 * @param delimiterLen the length of the delimiter between sentences
 * @return the length of the sentences that remain in {@code currentChunk}
 */
private static int adjustCurrentChunkForOverlap(Deque<String> currentChunk, int currentLen, int chunkOverlap,
		int delimiterLen) {
	int remaining = currentLen;
	while (remaining > chunkOverlap) {
		if (currentChunk.isEmpty()) {
			break;
		}
		int dropped = currentChunk.removeFirst().length();
		// A delimiter is only accounted for while another sentence still follows.
		if (!currentChunk.isEmpty()) {
			dropped += delimiterLen;
		}
		remaining -= dropped;
	}
	return remaining;
}

/**
* Joins the sentences into a single sentence.
* @param sentences the sentences to join
* @param delimiter the delimiter to join the sentences
* @param trimWhitespace whether to trim the whitespace
* @return the generated sentence
*/
private static String joinSentences(List<String> sentences, String delimiter, Boolean trimWhitespace) {
private static String joinSentences(Deque<String> sentences, String delimiter, Boolean trimWhitespace) {
String generatedSentence = String.join(delimiter, sentences);
if (trimWhitespace) {
generatedSentence = generatedSentence.trim();
Expand Down

0 comments on commit b69ed1f

Please sign in to comment.