Skip to content

Commit

Permalink
Merge pull request #2 from gluschenko/cld2
Browse files Browse the repository at this point in the history
CLD2 support
  • Loading branch information
gluschenko authored Jul 4, 2024
2 parents 5a05808 + 2085387 commit 02d6f54
Show file tree
Hide file tree
Showing 29 changed files with 608 additions and 24 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/github-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ jobs:
fail-fast: false
matrix:
projectName:
- LanguageIdentification.CLD2.Native
- LanguageIdentification.CLD2
- LanguageIdentification.CLD3.Native
- LanguageIdentification.CLD3
- LanguageIdentification.FastText.Native
Expand Down Expand Up @@ -61,6 +63,8 @@ jobs:
fail-fast: false
matrix:
projectName:
- LanguageIdentification.CLD2.Native
- LanguageIdentification.CLD2
- LanguageIdentification.CLD3.Native
- LanguageIdentification.CLD3
- LanguageIdentification.FastText.Native
Expand Down
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@
[submodule "third_party/fastText"]
path = third_party/fastText
url = https://github.com/facebookresearch/fastText.git
# CLD2 language-detector sources. The LanguageIdentification.CLD2.Native project
# mounts this directory into its builder container (see the PreBuild target in its .csproj).
[submodule "third_party/cld2"]
path = third_party/cld2
url = https://github.com/CLD2Owners/cld2.git
35 changes: 35 additions & 0 deletions src/LanguageIdentification.CLD2.ConsoleTest/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Multi-stage image for the CLD2 console smoke test.
# The build context is expected to be the repository root (all COPY sources are repo-relative).

# Runtime-only base used by the final stage.
# NOTE(review): the sibling Native Dockerfile runs Scripts/setup-runtime.sh, this image does
# not; confirm the published app needs no extra runtime packages to load libcld2.so.
FROM mcr.microsoft.com/dotnet/runtime:8.0 AS base
WORKDIR /app

# Build stage: install native build prerequisites, restore, build libcld2.so, compile the app.
FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build
ARG BUILD_CONFIGURATION=Release
WORKDIR /repo
# Project files are copied on their own so the restore layer below is reused
# for as long as the .csproj files do not change.
COPY ["src/LanguageIdentification.CLD2.ConsoleTest/LanguageIdentification.CLD2.ConsoleTest.csproj", "src/LanguageIdentification.CLD2.ConsoleTest/"]
COPY ["src/LanguageIdentification.CLD2.Native/LanguageIdentification.CLD2.Native.csproj", "src/LanguageIdentification.CLD2.Native/"]
COPY ["src/LanguageIdentification.CLD2/LanguageIdentification.CLD2.csproj", "src/LanguageIdentification.CLD2/"]

### CLD2
# The script is passed to bash as a file (not via `bash -c`), so it does not need the
# executable bit inside the image; this matches how the Native Dockerfile invokes it.
COPY ["src/LanguageIdentification.CLD2.Native/Scripts/setup-build.sh", "src/LanguageIdentification.CLD2.Native/Scripts/"]
RUN cd /repo/src/LanguageIdentification.CLD2.Native && bash ./Scripts/setup-build.sh
###

RUN dotnet restore "./src/LanguageIdentification.CLD2.ConsoleTest/LanguageIdentification.CLD2.ConsoleTest.csproj"
# Everything else (sources, remaining scripts, submodules) comes in only after the restore.
COPY . .

### CLD2
# Runs the native build script from the Native project directory, before the managed build.
RUN cd /repo/src/LanguageIdentification.CLD2.Native && bash ./Scripts/run-build.sh
###

WORKDIR /repo/src/LanguageIdentification.CLD2.ConsoleTest
RUN dotnet build "./LanguageIdentification.CLD2.ConsoleTest.csproj" -c $BUILD_CONFIGURATION -o /app/build

# Publish stage: ARG is re-declared because build arguments do not carry across stages.
FROM build AS publish
ARG BUILD_CONFIGURATION=Release

WORKDIR /repo/src/LanguageIdentification.CLD2.ConsoleTest
RUN dotnet publish "./LanguageIdentification.CLD2.ConsoleTest.csproj" -c $BUILD_CONFIGURATION -o /app/publish /p:UseAppHost=false

# Final image: runtime base plus the published output only.
FROM base AS final
WORKDIR /app
COPY --from=publish /app/publish .
ENTRYPOINT ["dotnet", "LanguageIdentification.CLD2.ConsoleTest.dll"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
<Project Sdk="Microsoft.NET.Sdk">

<!-- Console executable targeting net8.0, with Visual Studio container-tools settings. -->
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net8.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<DockerDefaultTargetOS>Linux</DockerDefaultTargetOS>
<ContainerDevelopmentMode>Regular</ContainerDevelopmentMode>
<!-- Docker build context is two levels up (the repository root); the Dockerfile's COPY sources are written relative to it. -->
<DockerfileContext>..\..</DockerfileContext>
<!-- Extra arguments for the container run; gives the container a fixed name. -->
<DockerfileRunArguments>--name language-identification-cld2</DockerfileRunArguments>
</PropertyGroup>

<!-- Shows the shared ..\.dockerignore in this project, nested under the Dockerfile item. -->
<ItemGroup>
<None Include="..\.dockerignore" Link=".dockerignore">
<DependentUpon>$(DockerDefaultDockerfile)</DependentUpon>
</None>
</ItemGroup>

<!-- Visual Studio container tooling targets. -->
<ItemGroup>
<PackageReference Include="Microsoft.VisualStudio.Azure.Containers.Tools.Targets" Version="1.20.1" />
</ItemGroup>

<!-- Managed CLD2 wrapper exercised by Program.cs. -->
<ItemGroup>
<ProjectReference Include="..\LanguageIdentification.CLD2\LanguageIdentification.CLD2.csproj" />
</ItemGroup>

</Project>
21 changes: 21 additions & 0 deletions src/LanguageIdentification.CLD2.ConsoleTest/Program.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
namespace LanguageIdentification.CLD2.ConsoleTest
{
    /// <summary>
    /// Console smoke test: runs CLD2 detection on one fixed sample and prints every prediction.
    /// </summary>
    internal class Program
    {
        /// <summary>
        /// Creates a detector, predicts the languages of a hard-coded sample and writes
        /// one line per prediction to standard output.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // The using declaration disposes the detector when Main returns.
            using var cld2 = new CLD2Detector();

            // A single input that mixes Latin-script and Cyrillic-script greetings.
            string text = "Hello, how are you? Привіт, як справи? Привет, как дела?";

            var topLangs = cld2.PredictLanguage(text);

            foreach (var lang in topLangs)
            {
                Console.WriteLine($"Language: {lang.Language}, Probability: {lang.Probability}, IsReliable: {lang.IsReliable}, Proportion: {lang.Proportion}");
            }
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"profiles": {
"Docker": {
"commandName": "Docker"
}
}
}
2 changes: 2 additions & 0 deletions src/LanguageIdentification.CLD2.Native/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Native library file; the project's PreBuild target only runs the Docker build when this file is absent.
libcld2.so
# Presumably the scratch directory of the native build scripts (confirm in Scripts/run-build.sh).
build_temp/**
14 changes: 14 additions & 0 deletions src/LanguageIdentification.CLD2.Native/CLD2NativeLibrary.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
using System.Runtime.InteropServices;

namespace LanguageIdentification.CLD2.Native
{
    /// <summary>
    /// Holds the file name of the native CLD2 library and a helper that loads it.
    /// </summary>
    public static class CLD2NativeLibrary
    {
        /// <summary>
        /// Library name handed to the loader.
        /// </summary>
        public const string Name = "libcld2.so";

        /// <summary>
        /// Loads the library identified by <see cref="Name"/> through
        /// <see cref="NativeLibrary.Load(string)"/>; the returned handle is not kept.
        /// </summary>
        public static void LoadNativeLibrary() => NativeLibrary.Load(Name);
    }
}
16 changes: 16 additions & 0 deletions src/LanguageIdentification.CLD2.Native/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Builder image for the native CLD2 library. The container's entrypoint is the build script;
# the project's PreBuild target runs it with the Native project directory and third_party/cld2
# bind-mounted under /repo.
FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build
# NOTE(review): BUILD_CONFIGURATION is declared but not referenced anywhere in this file.
ARG BUILD_CONFIGURATION=Release

WORKDIR /repo
# Only the project file and the scripts are copied before the setup steps, so those layers
# stay cached until one of these files changes.
COPY ["src/LanguageIdentification.CLD2.Native/LanguageIdentification.CLD2.Native.csproj", "src/LanguageIdentification.CLD2.Native/"]
COPY ["src/LanguageIdentification.CLD2.Native/Scripts/setup-build.sh", "src/LanguageIdentification.CLD2.Native/Scripts/"]
COPY ["src/LanguageIdentification.CLD2.Native/Scripts/setup-runtime.sh", "src/LanguageIdentification.CLD2.Native/Scripts/"]
COPY ["src/LanguageIdentification.CLD2.Native/Scripts/run-build.sh", "src/LanguageIdentification.CLD2.Native/Scripts/"]
WORKDIR /repo/src/LanguageIdentification.CLD2.Native

# Environment preparation scripts (their contents are not visible from this file).
RUN bash ./Scripts/setup-runtime.sh
RUN bash ./Scripts/setup-build.sh

# NOTE(review): the PreBuild target builds this image with the repository root as context, so
# this copies the whole context into the Native project directory rather than into /repo.
# The bind mount over this directory at run time hides the copy; confirm whether it is needed.
COPY . .

# Relative to the WORKDIR above.
ENTRYPOINT ["bash", "./Scripts/run-build.sh"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
<Project Sdk="Microsoft.NET.Sdk">

  <!-- Package identity and compiler settings. -->
  <PropertyGroup>
    <TargetFramework>net8.0</TargetFramework>
    <Version>0.0.0.1</Version>
    <PackageId>LanguageIdentification.CLD2.Native</PackageId>
    <AssemblyName>LanguageIdentification.CLD2.Native</AssemblyName>
    <RootNamespace>LanguageIdentification.CLD2.Native</RootNamespace>
    <Authors>Alexander Gluschenko</Authors>
    <RepositoryUrl>https://github.com/gluschenko/language-identification</RepositoryUrl>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
  </PropertyGroup>

  <!-- Release builds emit XML documentation one directory up; warnings 1591 and 1573 are silenced. -->
  <PropertyGroup Condition="'$(Configuration)'=='Release'">
    <GenerateDocumentationFile>true</GenerateDocumentationFile>
    <NoWarn>$(NoWarn);1591,1573</NoWarn>
    <DocumentationFile>..\LanguageIdentification.CLD2.Native.xml</DocumentationFile>
  </PropertyGroup>

  <!-- Names and host paths used by the Docker-based native build below. -->
  <PropertyGroup>
    <DockerImageName>cld2-builder</DockerImageName>
    <DockerContainerName>cld2-builder</DockerContainerName>
    <CurrentAbsolutePath>$(MSBuildThisFileDirectory)</CurrentAbsolutePath>
    <ThirdPartyAbsolutePath>$([System.IO.Path]::Combine('$(MSBuildThisFileDirectory)', '../../third_party/cld2/'))</ThirdPartyAbsolutePath>
    <NativeFileAbsolutePath>$([System.IO.Path]::Combine('$(MSBuildThisFileDirectory)', 'libcld2.so'))</NativeFileAbsolutePath>
  </PropertyGroup>

  <!--
    Produces libcld2.so inside a container. Skipped entirely when the file is already
    present next to this project file.
  -->
  <Target Name="PreBuild" BeforeTargets="PreBuildEvent" Condition="!Exists('$(NativeFileAbsolutePath)')">
    <!--
      Pre-clean: a previous run that failed part-way leaves the named container behind, and
      the run step below would then fail with a name conflict. The exit code is ignored
      because the container normally does not exist at this point.
    -->
    <Exec Command="docker rm --force $(DockerContainerName)" IgnoreExitCode="true" />
    <!-- Build the builder image; the context is the repository root. -->
    <Exec Command="docker build --file ./Dockerfile -t $(DockerImageName) ../.." />
    <!-- Run the build with the CLD2 sources and this project directory bind-mounted. Host paths are quoted so directories containing spaces work. -->
    <Exec Command="docker run --name $(DockerContainerName) -v &quot;$(ThirdPartyAbsolutePath):/repo/third_party/cld2/&quot; -v &quot;$(CurrentAbsolutePath):/repo/src/LanguageIdentification.CLD2.Native&quot; $(DockerImageName)" />
    <!-- Remove the finished container. -->
    <Exec Command="docker rm $(DockerContainerName) --force" />
  </Target>

  <!-- Shows the shared ..\.dockerignore in this project, nested under the Dockerfile item. -->
  <ItemGroup>
    <None Include="..\.dockerignore" Link=".dockerignore">
      <DependentUpon>$(DockerDefaultDockerfile)</DependentUpon>
    </None>
  </ItemGroup>

  <!-- Ships the native library: copied to the output directory and packed under runtimes/linux-x64/native. -->
  <ItemGroup>
    <Content Include="libcld2.so">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
      <PackageCopyToOutput>true</PackageCopyToOutput>
      <PackagePath>runtimes/linux-x64/native</PackagePath>
      <pack>true</pack>
      <Visible>false</Visible>
    </Content>
  </ItemGroup>

</Project>
106 changes: 106 additions & 0 deletions src/LanguageIdentification.CLD2.Native/Native/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Builds the `cld2` shared library: the CLD2 sources found under ./cld2 (relative to this
# file) compiled together with the bindings in binding.cc / binding.h.

# 3.8 is the first release whose CMAKE_CXX_STANDARD accepts 17. The upper bound opts in to
# newer policy defaults. CMake 4.x rejects projects that request compatibility with
# versions older than 3.5, which the previous "VERSION 2.8" did.
cmake_minimum_required(VERSION 3.8...3.29)
project(cld2_bridge LANGUAGES CXX)

# C++17 without compiler extensions (-std=c++17 on GCC/Clang; the matching switch elsewhere).
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

set(CMAKE_MACOSX_RPATH 1)
# The object library below ends up in a shared library, so it must be compiled as PIC.
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

set(CLD2_SOURCES
  ${PROJECT_SOURCE_DIR}/cld2/internal/cldutil.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/cldutil_shared.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/compact_lang_det.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/compact_lang_det_hint_code.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/compact_lang_det_impl.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/debug.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/fixunicodevalue.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/generated_entities.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/generated_language.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/generated_ulscript.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/getonescriptspan.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/lang_script.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/offsetmap.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/scoreonescriptspan.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/tote.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/utf8statetable.cc
  # generated_distinct_bi_0.cc is part of both table sets below, so it is listed
  # there once instead of a second time here.

  ### Chrome table set (alternative to "Full"; less perfect predictions).
  ### Enable these and disable the "Full" group to switch.
  # ${PROJECT_SOURCE_DIR}/cld2/internal/cld_generated_cjk_uni_prop_80.cc
  # ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_generated_cjk_compatible.cc
  # ${PROJECT_SOURCE_DIR}/cld2/internal/cld_generated_cjk_delta_bi_4.cc
  # ${PROJECT_SOURCE_DIR}/cld2/internal/generated_distinct_bi_0.cc
  # ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_generated_quadchrome_2.cc
  # ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_generated_deltaoctachrome.cc
  # ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_generated_distinctoctachrome.cc
  # ${PROJECT_SOURCE_DIR}/cld2/internal/cld_generated_score_quad_octa_2.cc
  ###

  ### Full table set (the one that is built)
  ${PROJECT_SOURCE_DIR}/cld2/internal/cld_generated_cjk_uni_prop_80.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_generated_cjk_compatible.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/cld_generated_cjk_delta_bi_32.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/generated_distinct_bi_0.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_generated_quad0122.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_generated_deltaocta0122.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_generated_distinctocta0122.cc
  ${PROJECT_SOURCE_DIR}/cld2/internal/cld_generated_score_quad_octa_0122.cc
  ###

  # Headers are listed alongside the sources; they are not compiled on their own.
  ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_dynamic_compat.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_dynamic_data_extractor.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_dynamic_data.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_dynamic_data_loader.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/cld2tablesummary.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/cldutil.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/cldutil_offline.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/cldutil_shared.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/compact_lang_det_hint_code.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/compact_lang_det_impl.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/debug.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/fixunicodevalue.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/generated_language.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/generated_ulscript.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/getonescriptspan.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/integral_types.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/lang_script.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/langspan.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/offsetmap.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/port.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/scoreonescriptspan.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/stringpiece.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/tote.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/unittest_data.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/utf8acceptinterchange.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/utf8prop_lettermarkscriptnum.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/utf8repl_lettermarklower.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/utf8scannot_lettermarkspecial.h
  ${PROJECT_SOURCE_DIR}/cld2/internal/utf8statetable.h
  ${PROJECT_SOURCE_DIR}/cld2/public/compact_lang_det.h
  ${PROJECT_SOURCE_DIR}/cld2/public/encodings.h

  # bindings
  ${PROJECT_SOURCE_DIR}/binding.cc
  ${PROJECT_SOURCE_DIR}/binding.h
)

# Everything is compiled once into an object library ...
add_library(objlib OBJECT ${CLD2_SOURCES})

target_include_directories(objlib PRIVATE
  "${PROJECT_SOURCE_DIR}/cld2/internal"
  "${PROJECT_SOURCE_DIR}/cld2/public"
)

# Warning suppressions per compiler family. MATCHES is a substring test, so "Clang"
# also covers AppleClang.
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
  target_compile_options(objlib PRIVATE -Wno-narrowing)
elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
  target_compile_options(objlib PRIVATE /wd4244 /wd4267)
endif()

# -O3 for every configuration except Debug (including an empty CMAKE_BUILD_TYPE).
# The generator expression also works with multi-config generators; MSVC-style
# drivers are skipped because they do not accept this flag.
if(NOT MSVC)
  target_compile_options(objlib PRIVATE "$<$<NOT:$<CONFIG:Debug>>:-O3>")
endif()

# ... and linked into the shared library (libcld2.so on Linux).
add_library(cld2 SHARED $<TARGET_OBJECTS:objlib>)

# binding.h is in the project root, next to binding.cc (see the source list above).
set_target_properties(cld2 PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/binding.h")
Loading

0 comments on commit 02d6f54

Please sign in to comment.