-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from gluschenko/cld2
CLD2 support
- Loading branch information
Showing
29 changed files
with
608 additions
and
24 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# Multi-stage image for the LanguageIdentification.CLD2.ConsoleTest app.
# All COPY sources are relative to the build context, which therefore has to be
# the repository root (the directory that contains src/).

# Runtime-only stage; the final image is derived from it.
FROM mcr.microsoft.com/dotnet/runtime:8.0 AS base
WORKDIR /app

# SDK stage: restores packages, builds the native library and compiles the app.
FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build
ARG BUILD_CONFIGURATION=Release
WORKDIR /repo
# Copy only the project files first so the restore layer below stays cached
# until one of the .csproj files changes.
COPY ["src/LanguageIdentification.CLD2.ConsoleTest/LanguageIdentification.CLD2.ConsoleTest.csproj", "src/LanguageIdentification.CLD2.ConsoleTest/"]
COPY ["src/LanguageIdentification.CLD2.Native/LanguageIdentification.CLD2.Native.csproj", "src/LanguageIdentification.CLD2.Native/"]
COPY ["src/LanguageIdentification.CLD2/LanguageIdentification.CLD2.csproj", "src/LanguageIdentification.CLD2/"]

### CLD2
# The script is passed to bash as a file (`bash <script>`), not as a command
# string (`bash -c <script>`): the latter executes the file directly and fails
# with "Permission denied" when the script lacks the executable bit.
COPY ["src/LanguageIdentification.CLD2.Native/Scripts/setup-build.sh", "src/LanguageIdentification.CLD2.Native/Scripts/"]
RUN cd /repo/src/LanguageIdentification.CLD2.Native && bash ./Scripts/setup-build.sh
###

RUN dotnet restore "./src/LanguageIdentification.CLD2.ConsoleTest/LanguageIdentification.CLD2.ConsoleTest.csproj"
# Bring in the rest of the build context (sources, remaining scripts).
COPY . .

### CLD2
# run-build.sh arrives with the `COPY . .` above; presumably it produces
# libcld2.so for the Native project - confirm against the script.
RUN cd /repo/src/LanguageIdentification.CLD2.Native && bash ./Scripts/run-build.sh
###

WORKDIR /repo/src/LanguageIdentification.CLD2.ConsoleTest
RUN dotnet build "./LanguageIdentification.CLD2.ConsoleTest.csproj" -c $BUILD_CONFIGURATION -o /app/build

# Publish stage: produces the deployable output in /app/publish.
FROM build AS publish
ARG BUILD_CONFIGURATION=Release

WORKDIR /repo/src/LanguageIdentification.CLD2.ConsoleTest
RUN dotnet publish "./LanguageIdentification.CLD2.ConsoleTest.csproj" -c $BUILD_CONFIGURATION -o /app/publish /p:UseAppHost=false

# Final image: runtime base plus the published application only.
FROM base AS final
WORKDIR /app
COPY --from=publish /app/publish .
ENTRYPOINT ["dotnet", "LanguageIdentification.CLD2.ConsoleTest.dll"]
28 changes: 28 additions & 0 deletions
28
src/LanguageIdentification.CLD2.ConsoleTest/LanguageIdentification.CLD2.ConsoleTest.csproj
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
<!-- Console smoke-test application for the LanguageIdentification.CLD2 library. -->
<Project Sdk="Microsoft.NET.Sdk">

  <!-- Compiler / language settings -->
  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net8.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>

  <!-- Container tooling settings; the Dockerfile context is two directories above this project. -->
  <PropertyGroup>
    <DockerDefaultTargetOS>Linux</DockerDefaultTargetOS>
    <ContainerDevelopmentMode>Regular</ContainerDevelopmentMode>
    <DockerfileContext>..\..</DockerfileContext>
    <DockerfileRunArguments>--name language-identification-cld2</DockerfileRunArguments>
  </PropertyGroup>

  <ItemGroup>
    <!-- Library under test -->
    <ProjectReference Include="..\LanguageIdentification.CLD2\LanguageIdentification.CLD2.csproj" />

    <!-- Visual Studio container tools build targets -->
    <PackageReference Include="Microsoft.VisualStudio.Azure.Containers.Tools.Targets" Version="1.20.1" />

    <!-- Show the shared .dockerignore nested under the Dockerfile -->
    <None Include="..\.dockerignore" Link=".dockerignore">
      <DependentUpon>$(DockerDefaultDockerfile)</DependentUpon>
    </None>
  </ItemGroup>

</Project>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
namespace LanguageIdentification.CLD2.ConsoleTest
{
    /// <summary>
    /// Console entry point that runs one CLD2 prediction and prints the result.
    /// </summary>
    internal class Program
    {
        /// <summary>
        /// Creates a <c>CLD2Detector</c>, runs it over a fixed sample text and writes
        /// one line per returned prediction to standard output.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // `using var` disposes the detector when Main returns.
            using var cld2 = new CLD2Detector();

            string text = "Hello, how are you? Привіт, як справи? Привет, как дела?";

            var topLangs = cld2.PredictLanguage(text);

            foreach (var lang in topLangs)
            {
                Console.WriteLine($"Language: {lang.Language}, Probability: {lang.Probability}, IsReliable: {lang.IsReliable}, Proportion: {lang.Proportion}");
            }
        }
    }
}
7 changes: 7 additions & 0 deletions
7
src/LanguageIdentification.CLD2.ConsoleTest/Properties/launchSettings.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
{ | ||
"profiles": { | ||
"Docker": { | ||
"commandName": "Docker" | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
libcld2.so | ||
build_temp/** |
14 changes: 14 additions & 0 deletions
14
src/LanguageIdentification.CLD2.Native/CLD2NativeLibrary.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
using System.Runtime.InteropServices; | ||
|
||
namespace LanguageIdentification.CLD2.Native
{
    /// <summary>
    /// Holds the file name of the native CLD2 library and a helper that loads it.
    /// </summary>
    public static class CLD2NativeLibrary
    {
        /// <summary>File name of the native CLD2 shared library.</summary>
        public const string Name = "libcld2.so";

        /// <summary>
        /// Passes <see cref="Name"/> to <see cref="NativeLibrary.Load(string)"/>.
        /// The returned handle is discarded.
        /// </summary>
        /// <exception cref="DllNotFoundException">The library could not be found.</exception>
        public static void LoadNativeLibrary() => _ = NativeLibrary.Load(Name);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Builder image for the native CLD2 library.
# COPY sources are relative to the build context, which therefore has to be the
# repository root (the directory that contains src/).
FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build
ARG BUILD_CONFIGURATION=Release

# Project file and helper scripts, copied to absolute destinations.
COPY ["src/LanguageIdentification.CLD2.Native/LanguageIdentification.CLD2.Native.csproj", "/repo/src/LanguageIdentification.CLD2.Native/"]
COPY ["src/LanguageIdentification.CLD2.Native/Scripts/setup-build.sh", "src/LanguageIdentification.CLD2.Native/Scripts/setup-runtime.sh", "src/LanguageIdentification.CLD2.Native/Scripts/run-build.sh", "/repo/src/LanguageIdentification.CLD2.Native/Scripts/"]

# Everything below runs from the Native project directory.
WORKDIR /repo/src/LanguageIdentification.CLD2.Native

# Order matters: the runtime setup script runs before the build setup script.
RUN ["bash", "./Scripts/setup-runtime.sh"]
RUN ["bash", "./Scripts/setup-build.sh"]

# NOTE(review): with the repository root as build context this copies the whole
# context into the Native project directory - confirm that is intended.
COPY . .

# The actual build is performed when the container is started.
ENTRYPOINT ["bash", "./Scripts/run-build.sh"]
52 changes: 52 additions & 0 deletions
52
src/LanguageIdentification.CLD2.Native/LanguageIdentification.CLD2.Native.csproj
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
<!--
  Managed wrapper project that carries the native CLD2 library (libcld2.so).
  When libcld2.so is missing next to this project file, the PreBuild target
  runs the Docker-based builder before the regular build continues.
-->
<Project Sdk="Microsoft.NET.Sdk">

  <!-- Package identity and compiler settings -->
  <PropertyGroup>
    <TargetFramework>net8.0</TargetFramework>
    <Version>0.0.0.1</Version>
    <PackageId>LanguageIdentification.CLD2.Native</PackageId>
    <AssemblyName>LanguageIdentification.CLD2.Native</AssemblyName>
    <RootNamespace>LanguageIdentification.CLD2.Native</RootNamespace>
    <Authors>Alexander Gluschenko</Authors>
    <RepositoryUrl>https://github.com/gluschenko/language-identification</RepositoryUrl>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
  </PropertyGroup>

  <!-- Release builds emit XML documentation; warnings 1591 and 1573 are suppressed. -->
  <PropertyGroup Condition="'$(Configuration)'=='Release'">
    <GenerateDocumentationFile>true</GenerateDocumentationFile>
    <NoWarn>$(NoWarn);1591,1573</NoWarn>
    <DocumentationFile>..\LanguageIdentification.CLD2.Native.xml</DocumentationFile>
  </PropertyGroup>

  <!-- Names and absolute paths used by the Docker-based native build below. -->
  <PropertyGroup>
    <DockerImageName>cld2-builder</DockerImageName>
    <DockerContainerName>cld2-builder</DockerContainerName>
    <CurrentAbsolutePath>$(MSBuildThisFileDirectory)</CurrentAbsolutePath>
    <ThirdPartyAbsolutePath>$([System.IO.Path]::Combine('$(MSBuildThisFileDirectory)', '../../third_party/cld2/'))</ThirdPartyAbsolutePath>
    <NativeFileAbsolutePath>$([System.IO.Path]::Combine('$(MSBuildThisFileDirectory)', 'libcld2.so'))</NativeFileAbsolutePath>
  </PropertyGroup>

  <!--
    Runs only while libcld2.so does not exist. The builder image is built with
    the repository root (../..) as context, then started with the CLD2 sources
    and this project directory mounted as volumes; the container is expected to
    leave libcld2.so in the mounted project directory.
    `docker run` uses the rm flag so the named container is removed even when the
    build inside it fails; a separate `docker rm` step would be skipped after a
    failing run and the leftover container would make the next
    `docker run` with the same name fail with a name conflict.
    NOTE(review): the volume paths are not quoted, so a checkout path that
    contains spaces will break the command line.
  -->
  <Target Name="PreBuild" BeforeTargets="PreBuildEvent" Condition="!Exists('$(NativeFileAbsolutePath)')">
    <Exec Command="docker build --file ./Dockerfile -t $(DockerImageName) ../.." />
    <Exec Command="docker run --rm --name $(DockerContainerName) -v $(ThirdPartyAbsolutePath):/repo/third_party/cld2/ -v $(CurrentAbsolutePath):/repo/src/LanguageIdentification.CLD2.Native $(DockerImageName)" />
  </Target>

  <ItemGroup>
    <None Include="..\.dockerignore" Link=".dockerignore">
      <DependentUpon>$(DockerDefaultDockerfile)</DependentUpon>
    </None>
  </ItemGroup>

  <!-- Ship the native library with build output and inside the NuGet package. -->
  <ItemGroup>
    <Content Include="libcld2.so">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
      <PackageCopyToOutput>true</PackageCopyToOutput>
      <PackagePath>runtimes/linux-x64/native</PackagePath>
      <Pack>true</Pack>
      <Visible>false</Visible>
    </Content>
  </ItemGroup>

</Project>
106 changes: 106 additions & 0 deletions
106
src/LanguageIdentification.CLD2.Native/Native/CMakeLists.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
# Builds the CLD2 sources together with the bindings (binding.cc / binding.h)
# into one shared library target named "cld2".
#
# 3.8 is the first CMake release that knows CMAKE_CXX_STANDARD 17. The upper
# bound opts in to newer policies; a bare 2.8 minimum is rejected by CMake 4.x.
cmake_minimum_required(VERSION 3.8...3.28)
project(cld2_bridge LANGUAGES CXX)

# C++17 without compiler extensions. On GNU/Clang this yields -std=c++17 and,
# unlike a hand-written flag, it also works for the MSVC branch below.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Deliberately kept in CMAKE_CXX_FLAGS: these flags precede the per-configuration
# flags on the command line, so e.g. RelWithDebInfo still ends up with its own
# optimisation level.
if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
  string(APPEND CMAKE_CXX_FLAGS " -O3")
endif()

set(CMAKE_MACOSX_RPATH 1)
# Object files end up in a shared library, so they must be position independent.
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

set(CLD2_SOURCES
  ${PROJECT_SOURCE_DIR}/cld2/internal/cldutil.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/cldutil_shared.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/compact_lang_det.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/compact_lang_det_hint_code.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/compact_lang_det_impl.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/debug.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/fixunicodevalue.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/generated_entities.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/generated_language.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/generated_ulscript.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/getonescriptspan.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/lang_script.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/offsetmap.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/scoreonescriptspan.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/tote.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/utf8statetable.cc
  # Listed once here; it belongs to both table sets below.
  ${PROJECT_SOURCE_DIR}/cld2/internal/generated_distinct_bi_0.cc

  ### Chrome (less perfect predictions) - alternative table set, disabled
  # ${PROJECT_SOURCE_DIR}/cld2/internal/cld_generated_cjk_uni_prop_80.cc
  # ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_generated_cjk_compatible.cc
  # ${PROJECT_SOURCE_DIR}/cld2/internal/cld_generated_cjk_delta_bi_4.cc
  # ${PROJECT_SOURCE_DIR}/cld2/internal/generated_distinct_bi_0.cc
  # ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_generated_quadchrome_2.cc
  # ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_generated_deltaoctachrome.cc
  # ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_generated_distinctoctachrome.cc
  # ${PROJECT_SOURCE_DIR}/cld2/internal/cld_generated_score_quad_octa_2.cc
  ###

  ### Full - table set that is actually compiled
  ${PROJECT_SOURCE_DIR}/cld2/internal/cld_generated_cjk_uni_prop_80.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_generated_cjk_compatible.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/cld_generated_cjk_delta_bi_32.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_generated_quad0122.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_generated_deltaocta0122.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_generated_distinctocta0122.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/cld_generated_score_quad_octa_0122.cc
  ###

  ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_dynamic_compat.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_dynamic_data_extractor.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_dynamic_data.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_dynamic_data_loader.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/cld2tablesummary.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/cldutil.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/cldutil_offline.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/cldutil_shared.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/compact_lang_det_hint_code.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/compact_lang_det_impl.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/debug.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/fixunicodevalue.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/generated_language.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/generated_ulscript.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/getonescriptspan.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/integral_types.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/lang_script.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/langspan.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/offsetmap.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/port.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/scoreonescriptspan.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/stringpiece.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/tote.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/unittest_data.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/utf8acceptinterchange.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/utf8prop_lettermarkscriptnum.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/utf8repl_lettermarklower.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/utf8scannot_lettermarkspecial.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/utf8statetable.h
  ${PROJECT_SOURCE_DIR}/cld2/public/compact_lang_det.h
  ${PROJECT_SOURCE_DIR}/cld2/public/encodings.h

  # bindings
  ${PROJECT_SOURCE_DIR}/binding.cc
  ${PROJECT_SOURCE_DIR}/binding.h
)

# All sources are compiled once into an object library ...
add_library(objlib OBJECT ${CLD2_SOURCES})

target_include_directories(objlib PRIVATE
  ${PROJECT_SOURCE_DIR}/cld2/internal
  ${PROJECT_SOURCE_DIR}/cld2/public
)

# Silence narrowing / size-conversion diagnostics in the compiled sources.
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
  target_compile_options(objlib PRIVATE -Wno-narrowing)
elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
  target_compile_options(objlib PRIVATE /wd4244 /wd4267)
endif()

# ... and the shared library is linked from those objects.
add_library(cld2 SHARED $<TARGET_OBJECTS:objlib>)

# Same location as the binding.h entry in CLD2_SOURCES.
set_target_properties(cld2 PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/binding.h")
Oops, something went wrong.