-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing 14 changed files with 674 additions and 85 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven module descriptor for the fixed chunker; inherits from the jchunk parent POM. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>com.github.PabloSanchi</groupId>
        <artifactId>jchunk</artifactId>
        <version>0.0.1-SNAPSHOT</version>
    </parent>

    <artifactId>jchunk-fixed</artifactId>
    <packaging>jar</packaging>
    <name>JChunk - Fixed Chunker</name>
    <description>Fixed Chunker for Java</description>
    <url>https://github.com/PabloSanchi/jchunk</url>

    <scm>
        <url>https://github.com/PabloSanchi/jchunk</url>
        <connection>git://github.com/PabloSanchi/jchunk.git</connection>
        <developerConnection>git@github.com:PabloSanchi/jchunk.git</developerConnection>
    </scm>

    <dependencies>
        <!-- sibling module, pinned to the same version as the parent -->
        <dependency>
            <groupId>com.github.PabloSanchi</groupId>
            <artifactId>jchunk-core</artifactId>
            <version>${project.parent.version}</version>
        </dependency>
        <!-- no version given here: presumably managed by the parent POM (confirm) -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot</artifactId>
        </dependency>

        <!-- test -->
        <!-- Scoped to test so the testing stack is not exposed on the compile/runtime
             classpath of projects that depend on this artifact. -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>

</project>
118 changes: 118 additions & 0 deletions
118
jchunk-fixed/src/main/java/jchunk/chunker/fixed/Config.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
package jchunk.chunker.fixed; | ||
|
||
import org.springframework.util.Assert; | ||
|
||
/** | ||
* Configuration for the fixed chunker | ||
* | ||
* @author Pablo Sanchidrian Herrera | ||
*/ | ||
public class Config { | ||
|
||
private final Integer chunkSize; | ||
|
||
private final Integer chunkOverlap; | ||
|
||
private final String delimiter; | ||
|
||
private final Boolean trimWhitespace; | ||
|
||
private final Delimiter keepDelimiter; | ||
|
||
public Integer getChunkSize() { | ||
return chunkSize; | ||
} | ||
|
||
public Integer getChunkOverlap() { | ||
return chunkOverlap; | ||
} | ||
|
||
public String getDelimiter() { | ||
return delimiter; | ||
} | ||
|
||
public Boolean getTrimWhitespace() { | ||
return trimWhitespace; | ||
} | ||
|
||
public Delimiter getKeepDelimiter() { | ||
return keepDelimiter; | ||
} | ||
|
||
public Config(Integer chunkSize, Integer chunkOverlap, String delimiter, Boolean trimWhitespace, | ||
Delimiter keepDelimiter) { | ||
this.chunkSize = chunkSize; | ||
this.chunkOverlap = chunkOverlap; | ||
this.delimiter = delimiter; | ||
this.trimWhitespace = trimWhitespace; | ||
this.keepDelimiter = keepDelimiter; | ||
} | ||
|
||
/** | ||
* {@return the default config} | ||
*/ | ||
public static Config defaultConfig() { | ||
return builder().build(); | ||
} | ||
|
||
public static Builder builder() { | ||
return new Builder(); | ||
} | ||
|
||
public static class Builder { | ||
|
||
private Integer chunkSize = 1000; | ||
|
||
private Integer chunkOverlap = 100; | ||
|
||
private String delimiter = " "; | ||
|
||
private Boolean trimWhitespace = true; | ||
|
||
private Delimiter keepDelimiter = Delimiter.NONE; | ||
|
||
public Builder chunkSize(Integer chunkSize) { | ||
Assert.isTrue(chunkSize > 0, "Chunk size must be greater than 0"); | ||
this.chunkSize = chunkSize; | ||
return this; | ||
} | ||
|
||
public Builder chunkOverlap(Integer chunkOverlap) { | ||
Assert.isTrue(chunkOverlap >= 0, "Chunk overlap must be greater than or equal to 0"); | ||
this.chunkOverlap = chunkOverlap; | ||
return this; | ||
} | ||
|
||
public Builder delimiter(String delimiter) { | ||
this.delimiter = delimiter; | ||
return this; | ||
} | ||
|
||
public Builder trimWhitespace(Boolean trimWhitespace) { | ||
this.trimWhitespace = trimWhitespace; | ||
return this; | ||
} | ||
|
||
public Builder keepDelimiter(Delimiter keepDelimiter) { | ||
this.keepDelimiter = keepDelimiter; | ||
return this; | ||
} | ||
|
||
public Config build() { | ||
Assert.isTrue(chunkSize > chunkOverlap, "Chunk size must be greater than chunk overlap"); | ||
return new Config(chunkSize, chunkOverlap, delimiter, trimWhitespace, keepDelimiter); | ||
} | ||
|
||
} | ||
|
||
/** | ||
* Enum to represent the delimiter configuration NONE: No delimiter START: Delimiter | ||
* at the start of the chunk END: Delimiter at the end of the chunk | ||
*/ | ||
public enum Delimiter { | ||
|
||
NONE, START, END | ||
|
||
} | ||
|
||
} |
31 changes: 31 additions & 0 deletions
31
jchunk-fixed/src/main/java/jchunk/chunker/fixed/FixedChunker.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
package jchunk.chunker.fixed; | ||
|
||
import jchunk.chunker.core.chunk.Chunk; | ||
import jchunk.chunker.core.chunk.IChunker; | ||
|
||
import java.util.List; | ||
|
||
/** | ||
* {@link FixedChunker} is a chunker that splits the content into fixed size chunks. | ||
* | ||
* @author Pablo Sanchidrian Herrera | ||
*/ | ||
public class FixedChunker implements IChunker { | ||
|
||
private final Config config; | ||
|
||
public FixedChunker() { | ||
this(Config.defaultConfig()); | ||
} | ||
|
||
public FixedChunker(Config config) { | ||
this.config = config; | ||
} | ||
|
||
@Override | ||
public List<Chunk> split(String content) { | ||
List<String> sentences = Utils.splitIntoSentences(content, config); | ||
return Utils.mergeSentences(sentences, config); | ||
} | ||
|
||
} |
Oops, something went wrong.