From 6e82a376b1bb85cb006039685d1046dc0f53761c Mon Sep 17 00:00:00 2001 From: Alexander Gluschenko Date: Tue, 2 Jul 2024 21:58:09 +0400 Subject: [PATCH 1/7] Add CLD2 as submodule --- .gitmodules | 3 +++ third_party/cld2 | 1 + 2 files changed, 4 insertions(+) create mode 160000 third_party/cld2 diff --git a/.gitmodules b/.gitmodules index b5c9250..b46dbd8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "third_party/fastText"] path = third_party/fastText url = https://github.com/facebookresearch/fastText.git +[submodule "third_party/cld2"] + path = third_party/cld2 + url = https://github.com/CLD2Owners/cld2.git diff --git a/third_party/cld2 b/third_party/cld2 new file mode 160000 index 0000000..b56fa78 --- /dev/null +++ b/third_party/cld2 @@ -0,0 +1 @@ +Subproject commit b56fa78a2fe44ac2851bae5bf4f4693a0644da7b From 8ebde065809ab12ceb0cd06baac7b0ce60375f36 Mon Sep 17 00:00:00 2001 From: Alexander Gluschenko Date: Thu, 4 Jul 2024 21:28:26 +0400 Subject: [PATCH 2/7] Add LanguageIdentification.CLD2 --- .../Dockerfile | 35 ++++++ ...uageIdentification.CLD2.ConsoleTest.csproj | 28 +++++ .../Program.cs | 21 ++++ .../Properties/launchSettings.json | 7 ++ .../.gitignore | 2 + .../CLD2NativeLibrary.cs | 14 +++ .../Dockerfile | 16 +++ .../LanguageIdentification.CLD2.Native.csproj | 52 +++++++++ .../Native/CMakeLists.txt | 106 ++++++++++++++++++ .../Native/binding.cc | 77 +++++++++++++ .../Native/binding.h | 28 +++++ .../Scripts/run-build.sh | 30 +++++ .../Scripts/setup-build.sh | 8 ++ .../Scripts/setup-runtime.sh | 5 + .../CLD2Detector.cs | 57 ++++++++++ .../CLD2DetectorWrapper.cs | 13 +++ .../CLD2PredictionResult.cs | 22 ++++ .../LanguageIdentification.CLD2.csproj | 28 +++++ src/LanguageIdentification.sln | 29 ++++- 19 files changed, 575 insertions(+), 3 deletions(-) create mode 100644 src/LanguageIdentification.CLD2.ConsoleTest/Dockerfile create mode 100644 src/LanguageIdentification.CLD2.ConsoleTest/LanguageIdentification.CLD2.ConsoleTest.csproj create 
mode 100644 src/LanguageIdentification.CLD2.ConsoleTest/Program.cs create mode 100644 src/LanguageIdentification.CLD2.ConsoleTest/Properties/launchSettings.json create mode 100644 src/LanguageIdentification.CLD2.Native/.gitignore create mode 100644 src/LanguageIdentification.CLD2.Native/CLD2NativeLibrary.cs create mode 100644 src/LanguageIdentification.CLD2.Native/Dockerfile create mode 100644 src/LanguageIdentification.CLD2.Native/LanguageIdentification.CLD2.Native.csproj create mode 100644 src/LanguageIdentification.CLD2.Native/Native/CMakeLists.txt create mode 100644 src/LanguageIdentification.CLD2.Native/Native/binding.cc create mode 100644 src/LanguageIdentification.CLD2.Native/Native/binding.h create mode 100644 src/LanguageIdentification.CLD2.Native/Scripts/run-build.sh create mode 100644 src/LanguageIdentification.CLD2.Native/Scripts/setup-build.sh create mode 100644 src/LanguageIdentification.CLD2.Native/Scripts/setup-runtime.sh create mode 100644 src/LanguageIdentification.CLD2/CLD2Detector.cs create mode 100644 src/LanguageIdentification.CLD2/CLD2DetectorWrapper.cs create mode 100644 src/LanguageIdentification.CLD2/CLD2PredictionResult.cs create mode 100644 src/LanguageIdentification.CLD2/LanguageIdentification.CLD2.csproj diff --git a/src/LanguageIdentification.CLD2.ConsoleTest/Dockerfile b/src/LanguageIdentification.CLD2.ConsoleTest/Dockerfile new file mode 100644 index 0000000..0ae3d1c --- /dev/null +++ b/src/LanguageIdentification.CLD2.ConsoleTest/Dockerfile @@ -0,0 +1,35 @@ +FROM mcr.microsoft.com/dotnet/runtime:8.0 AS base +WORKDIR /app + +FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build +ARG BUILD_CONFIGURATION=Release +WORKDIR /repo +COPY ["src/LanguageIdentification.CLD2.ConsoleTest/LanguageIdentification.CLD2.ConsoleTest.csproj", "src/LanguageIdentification.CLD2.ConsoleTest/"] +COPY ["src/LanguageIdentification.CLD2.Native/LanguageIdentification.CLD2.Native.csproj", "src/LanguageIdentification.CLD2.Native/"] +COPY 
["src/LanguageIdentification.CLD2/LanguageIdentification.CLD2.csproj", "src/LanguageIdentification.CLD2/"] + +### CLD2 +COPY ["src/LanguageIdentification.CLD2.Native/Scripts/setup-build.sh", "src/LanguageIdentification.CLD2.Native/Scripts/"] +RUN cd /repo/src/LanguageIdentification.CLD2.Native && bash -c ./Scripts/setup-build.sh +### + +RUN dotnet restore "./src/LanguageIdentification.CLD2.ConsoleTest/LanguageIdentification.CLD2.ConsoleTest.csproj" +COPY . . + +### CLD2 +RUN cd /repo/src/LanguageIdentification.CLD2.Native && bash -c ./Scripts/run-build.sh +### + +WORKDIR /repo/src/LanguageIdentification.CLD2.ConsoleTest +RUN dotnet build "./LanguageIdentification.CLD2.ConsoleTest.csproj" -c $BUILD_CONFIGURATION -o /app/build + +FROM build AS publish +ARG BUILD_CONFIGURATION=Release + +WORKDIR /repo/src/LanguageIdentification.CLD2.ConsoleTest +RUN dotnet publish "./LanguageIdentification.CLD2.ConsoleTest.csproj" -c $BUILD_CONFIGURATION -o /app/publish /p:UseAppHost=false + +FROM base AS final +WORKDIR /app +COPY --from=publish /app/publish . +ENTRYPOINT ["dotnet", "LanguageIdentification.CLD2.ConsoleTest.dll"] \ No newline at end of file diff --git a/src/LanguageIdentification.CLD2.ConsoleTest/LanguageIdentification.CLD2.ConsoleTest.csproj b/src/LanguageIdentification.CLD2.ConsoleTest/LanguageIdentification.CLD2.ConsoleTest.csproj new file mode 100644 index 0000000..3861fa6 --- /dev/null +++ b/src/LanguageIdentification.CLD2.ConsoleTest/LanguageIdentification.CLD2.ConsoleTest.csproj @@ -0,0 +1,28 @@ + + + + Exe + net8.0 + enable + enable + Linux + Regular + ..\.. 
+ --name language-identification-cld2 + + + + + $(DockerDefaultDockerfile) + + + + + + + + + + + + diff --git a/src/LanguageIdentification.CLD2.ConsoleTest/Program.cs b/src/LanguageIdentification.CLD2.ConsoleTest/Program.cs new file mode 100644 index 0000000..e293512 --- /dev/null +++ b/src/LanguageIdentification.CLD2.ConsoleTest/Program.cs @@ -0,0 +1,21 @@ +namespace LanguageIdentification.CLD2.ConsoleTest +{ + internal class Program + { + static void Main(string[] args) + { + using var cld2 = new CLD2Detector(); + + string text = "Hello, how are you? Привіт, як справи? Привет, как дела?"; + + var topLangs = cld2.PredictLanguage(text); + + foreach (var lang in topLangs) + { + Console.WriteLine($"Language: {lang.Language}, Probability: {lang.Probability}, IsReliable: {lang.IsReliable}, Proportion: {lang.Proportion}"); + } + + ; + } + } +} diff --git a/src/LanguageIdentification.CLD2.ConsoleTest/Properties/launchSettings.json b/src/LanguageIdentification.CLD2.ConsoleTest/Properties/launchSettings.json new file mode 100644 index 0000000..65b8965 --- /dev/null +++ b/src/LanguageIdentification.CLD2.ConsoleTest/Properties/launchSettings.json @@ -0,0 +1,7 @@ +{ + "profiles": { + "Docker": { + "commandName": "Docker" + } + } +} \ No newline at end of file diff --git a/src/LanguageIdentification.CLD2.Native/.gitignore b/src/LanguageIdentification.CLD2.Native/.gitignore new file mode 100644 index 0000000..162500b --- /dev/null +++ b/src/LanguageIdentification.CLD2.Native/.gitignore @@ -0,0 +1,2 @@ +libcld2.so +build_temp/** diff --git a/src/LanguageIdentification.CLD2.Native/CLD2NativeLibrary.cs b/src/LanguageIdentification.CLD2.Native/CLD2NativeLibrary.cs new file mode 100644 index 0000000..f7f6204 --- /dev/null +++ b/src/LanguageIdentification.CLD2.Native/CLD2NativeLibrary.cs @@ -0,0 +1,14 @@ +using System.Runtime.InteropServices; + +namespace LanguageIdentification.CLD2.Native +{ + public static class CLD2NativeLibrary + { + public const string Name = "libcld2.so"; + + 
public static void LoadNativeLibrary() + { + NativeLibrary.Load(Name); + } + } +} diff --git a/src/LanguageIdentification.CLD2.Native/Dockerfile b/src/LanguageIdentification.CLD2.Native/Dockerfile new file mode 100644 index 0000000..3e7e769 --- /dev/null +++ b/src/LanguageIdentification.CLD2.Native/Dockerfile @@ -0,0 +1,16 @@ +FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build +ARG BUILD_CONFIGURATION=Release + +WORKDIR /repo +COPY ["src/LanguageIdentification.CLD2.Native/LanguageIdentification.CLD2.Native.csproj", "src/LanguageIdentification.CLD2.Native/"] +COPY ["src/LanguageIdentification.CLD2.Native/Scripts/setup-build.sh", "src/LanguageIdentification.CLD2.Native/Scripts/"] +COPY ["src/LanguageIdentification.CLD2.Native/Scripts/setup-runtime.sh", "src/LanguageIdentification.CLD2.Native/Scripts/"] +COPY ["src/LanguageIdentification.CLD2.Native/Scripts/run-build.sh", "src/LanguageIdentification.CLD2.Native/Scripts/"] +WORKDIR /repo/src/LanguageIdentification.CLD2.Native + +RUN bash ./Scripts/setup-runtime.sh +RUN bash ./Scripts/setup-build.sh + +COPY . . 
+ +ENTRYPOINT ["bash", "./Scripts/run-build.sh"] diff --git a/src/LanguageIdentification.CLD2.Native/LanguageIdentification.CLD2.Native.csproj b/src/LanguageIdentification.CLD2.Native/LanguageIdentification.CLD2.Native.csproj new file mode 100644 index 0000000..8797823 --- /dev/null +++ b/src/LanguageIdentification.CLD2.Native/LanguageIdentification.CLD2.Native.csproj @@ -0,0 +1,52 @@ + + + + net8.0 + 0.0.0.1 + LanguageIdentification.CLD2.Native + LanguageIdentification.CLD2.Native + LanguageIdentification.CLD2.Native + Alexander Gluschenko + https://github.com/gluschenko/language-identification + enable + enable + true + + + + true + $(NoWarn);1591,1573 + ..\LanguageIdentification.CLD2.Native.xml + + + + cld2-builder + cld2-builder + $(MSBuildThisFileDirectory) + $([System.IO.Path]::Combine('$(MSBuildThisFileDirectory)', '../../third_party/cld2/')) + $([System.IO.Path]::Combine('$(MSBuildThisFileDirectory)', 'libcld2.so')) + + + + + + + + + + + $(DockerDefaultDockerfile) + + + + + + PreserveNewest + true + runtimes/linux-x64/native + true + false + + + + diff --git a/src/LanguageIdentification.CLD2.Native/Native/CMakeLists.txt b/src/LanguageIdentification.CLD2.Native/Native/CMakeLists.txt new file mode 100644 index 0000000..4d6a56b --- /dev/null +++ b/src/LanguageIdentification.CLD2.Native/Native/CMakeLists.txt @@ -0,0 +1,106 @@ +cmake_minimum_required(VERSION 2.8 FATAL_ERROR) +project(cld2_bridge) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") +if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") +endif() + +if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing") +elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267") +endif() + + +set(CMAKE_MACOSX_RPATH 1) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +include_directories( + ${PROJECT_SOURCE_DIR}/cld2/internal + ${PROJECT_SOURCE_DIR}/cld2/public +) + +set(CLD2_SOURCES + 
${PROJECT_SOURCE_DIR}/cld2/internal/cldutil.cc + ${PROJECT_SOURCE_DIR}/cld2/internal/cldutil_shared.cc + ${PROJECT_SOURCE_DIR}/cld2/internal/compact_lang_det.cc + ${PROJECT_SOURCE_DIR}/cld2/internal/compact_lang_det_hint_code.cc + ${PROJECT_SOURCE_DIR}/cld2/internal/compact_lang_det_impl.cc + ${PROJECT_SOURCE_DIR}/cld2/internal/debug.cc + ${PROJECT_SOURCE_DIR}/cld2/internal/fixunicodevalue.cc + ${PROJECT_SOURCE_DIR}/cld2/internal/generated_entities.cc + ${PROJECT_SOURCE_DIR}/cld2/internal/generated_language.cc + ${PROJECT_SOURCE_DIR}/cld2/internal/generated_ulscript.cc + ${PROJECT_SOURCE_DIR}/cld2/internal/getonescriptspan.cc + ${PROJECT_SOURCE_DIR}/cld2/internal/lang_script.cc + ${PROJECT_SOURCE_DIR}/cld2/internal/offsetmap.cc + ${PROJECT_SOURCE_DIR}/cld2/internal/scoreonescriptspan.cc + ${PROJECT_SOURCE_DIR}/cld2/internal/tote.cc + ${PROJECT_SOURCE_DIR}/cld2/internal/utf8statetable.cc + ${PROJECT_SOURCE_DIR}/cld2/internal/generated_distinct_bi_0.cc + + ### Chrome (less perfect predictions) + # ${PROJECT_SOURCE_DIR}/cld2/internal/cld_generated_cjk_uni_prop_80.cc + # ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_generated_cjk_compatible.cc + # ${PROJECT_SOURCE_DIR}/cld2/internal/cld_generated_cjk_delta_bi_4.cc + # ${PROJECT_SOURCE_DIR}/cld2/internal/generated_distinct_bi_0.cc + # ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_generated_quadchrome_2.cc + # ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_generated_deltaoctachrome.cc + # ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_generated_distinctoctachrome.cc + # ${PROJECT_SOURCE_DIR}/cld2/internal/cld_generated_score_quad_octa_2.cc + ### + + ### Full + ${PROJECT_SOURCE_DIR}/cld2/internal/cld_generated_cjk_uni_prop_80.cc + ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_generated_cjk_compatible.cc + ${PROJECT_SOURCE_DIR}/cld2/internal/cld_generated_cjk_delta_bi_32.cc + ${PROJECT_SOURCE_DIR}/cld2/internal/generated_distinct_bi_0.cc + ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_generated_quad0122.cc + 
${PROJECT_SOURCE_DIR}/cld2/internal/cld2_generated_deltaocta0122.cc + ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_generated_distinctocta0122.cc + ${PROJECT_SOURCE_DIR}/cld2/internal/cld_generated_score_quad_octa_0122.cc + ### + + ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_dynamic_compat.h + ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_dynamic_data_extractor.h + ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_dynamic_data.h + ${PROJECT_SOURCE_DIR}/cld2/internal/cld2_dynamic_data_loader.h + ${PROJECT_SOURCE_DIR}/cld2/internal/cld2tablesummary.h + ${PROJECT_SOURCE_DIR}/cld2/internal/cldutil.h + ${PROJECT_SOURCE_DIR}/cld2/internal/cldutil_offline.h + ${PROJECT_SOURCE_DIR}/cld2/internal/cldutil_shared.h + ${PROJECT_SOURCE_DIR}/cld2/internal/compact_lang_det_hint_code.h + ${PROJECT_SOURCE_DIR}/cld2/internal/compact_lang_det_impl.h + ${PROJECT_SOURCE_DIR}/cld2/internal/debug.h + ${PROJECT_SOURCE_DIR}/cld2/internal/fixunicodevalue.h + ${PROJECT_SOURCE_DIR}/cld2/internal/generated_language.h + ${PROJECT_SOURCE_DIR}/cld2/internal/generated_ulscript.h + ${PROJECT_SOURCE_DIR}/cld2/internal/getonescriptspan.h + ${PROJECT_SOURCE_DIR}/cld2/internal/integral_types.h + ${PROJECT_SOURCE_DIR}/cld2/internal/lang_script.h + ${PROJECT_SOURCE_DIR}/cld2/internal/langspan.h + ${PROJECT_SOURCE_DIR}/cld2/internal/offsetmap.h + ${PROJECT_SOURCE_DIR}/cld2/internal/port.h + ${PROJECT_SOURCE_DIR}/cld2/internal/scoreonescriptspan.h + ${PROJECT_SOURCE_DIR}/cld2/internal/stringpiece.h + ${PROJECT_SOURCE_DIR}/cld2/internal/tote.h + ${PROJECT_SOURCE_DIR}/cld2/internal/unittest_data.h + ${PROJECT_SOURCE_DIR}/cld2/internal/utf8acceptinterchange.h + ${PROJECT_SOURCE_DIR}/cld2/internal/utf8prop_lettermarkscriptnum.h + ${PROJECT_SOURCE_DIR}/cld2/internal/utf8repl_lettermarklower.h + ${PROJECT_SOURCE_DIR}/cld2/internal/utf8scannot_lettermarkspecial.h + ${PROJECT_SOURCE_DIR}/cld2/internal/utf8statetable.h + ${PROJECT_SOURCE_DIR}/cld2/public/compact_lang_det.h + ${PROJECT_SOURCE_DIR}/cld2/public/encodings.h + + # bindings + 
${PROJECT_SOURCE_DIR}/binding.cc + ${PROJECT_SOURCE_DIR}/binding.h +) + +add_library(objlib OBJECT ${CLD2_SOURCES}) + +add_library(cld2 SHARED $<TARGET_OBJECTS:objlib>) # shared library is linked from the objlib object files declared above + +set_target_properties(cld2 PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/binding.h") \ No newline at end of file diff --git a/src/LanguageIdentification.CLD2.Native/Native/binding.cc b/src/LanguageIdentification.CLD2.Native/Native/binding.cc new file mode 100644 index 0000000..e8d105d --- /dev/null +++ b/src/LanguageIdentification.CLD2.Native/Native/binding.cc @@ -0,0 +1,77 @@ +#include +#include +#include +#include +#include "./cld2/public/compact_lang_det.h" +#include "binding.h" + +#define MAX_LANGUAGE_COUNT 3 + +extern "C" +{ + PredictionResult* PredictLanguage(char *data, int length, int* resultCount) + { + bool is_plain_text = true; + CLD2::CLDHints cldhints = {NULL, NULL, 0, CLD2::UNKNOWN_LANGUAGE}; + bool allow_extended_lang = true; + int flags = 0; + CLD2::Language language3[MAX_LANGUAGE_COUNT]; + int percent3[MAX_LANGUAGE_COUNT]; + double normalized_score3[MAX_LANGUAGE_COUNT]; + CLD2::ResultChunkVector result_chunk_vector; + int text_bytes; + bool is_reliable; + + if (length <= 0) + { + length = strlen(data); + } + + CLD2::Language summary_lang = CLD2::UNKNOWN_LANGUAGE; + + summary_lang = CLD2::ExtDetectLanguageSummary( + data, + length, + is_plain_text, + &cldhints, + flags, + language3, + percent3, + normalized_score3, + &result_chunk_vector, + &text_bytes, + &is_reliable); + + int a = 0; + + for (int i = 0; i < MAX_LANGUAGE_COUNT; ++i) + { + // if (percent3[i] > 0) + { + a++; + } + } + + *resultCount = a; + + PredictionResult* result = new PredictionResult[*resultCount]; + for (int i = 0; i < *resultCount; ++i) { + result[i].language = CLD2::LanguageCode(language3[i]); + result[i].script = CLD2::ULScriptCode(CLD2::LanguageRecognizedScript(language3[i], 0)); + result[i].probability = normalized_score3[i]; + result[i].is_reliable = is_reliable; + result[i].proportion = percent3[i]; + } + + return
result; + } + + void FreeResults(PredictionResult* results, int count) + { + for (int i = 0; i < count; ++i) { + free((void*)results[i].language); + free((void*)results[i].script); + } + // delete[] results; + } +} \ No newline at end of file diff --git a/src/LanguageIdentification.CLD2.Native/Native/binding.h b/src/LanguageIdentification.CLD2.Native/Native/binding.h new file mode 100644 index 0000000..4272ccd --- /dev/null +++ b/src/LanguageIdentification.CLD2.Native/Native/binding.h @@ -0,0 +1,28 @@ +#pragma once + +#include +#include +#include "./cld2/public/compact_lang_det.h" + +#ifndef EXPORT +#if defined(_WIN32) || defined(_WIN64) +#define EXPORT __declspec(dllexport) // this header is compiled into the library that defines these functions, so they are exported +#else +#define EXPORT extern +#endif +#endif + +extern "C" +{ + struct PredictionResult { + const char* language; + const char* script; + double probability; + bool is_reliable; + double proportion; + }; + + EXPORT PredictionResult* PredictLanguage(char *data, int length, int* resultCount); + + EXPORT void FreeResults(PredictionResult* results, int count); +} diff --git a/src/LanguageIdentification.CLD2.Native/Scripts/run-build.sh b/src/LanguageIdentification.CLD2.Native/Scripts/run-build.sh new file mode 100644 index 0000000..bf15a88 --- /dev/null +++ b/src/LanguageIdentification.CLD2.Native/Scripts/run-build.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -e + +echo "Hello world"; + +workspace="build_temp" + +mkdir "$workspace" -p +cp -a ../../third_party/cld2/. $workspace/cld2 +cp -a Native/. $workspace + +ls -R . + +cd "$workspace" + +mkdir -p build # -p: do not abort under set -e when a previous failed run left the directory behind +cd build +cmake .. +make -j $(nproc) # make +cd .. + +echo $(pwd) +ls -R build +cd .. 
+ +find "$workspace/build" -name "libcld2.so" -exec cp {} libcld2.so \; +rm -rf "$workspace" +ldd libcld2.so + +echo "Goodbye world"; diff --git a/src/LanguageIdentification.CLD2.Native/Scripts/setup-build.sh b/src/LanguageIdentification.CLD2.Native/Scripts/setup-build.sh new file mode 100644 index 0000000..c91276b --- /dev/null +++ b/src/LanguageIdentification.CLD2.Native/Scripts/setup-build.sh @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +echo "Installing build packages"; + +sudo apt -y update || apt -y update # fall back to plain apt when sudo is missing or fails +sudo apt -y install cmake || apt -y install cmake +sudo apt -y install g++ || apt -y install g++ diff --git a/src/LanguageIdentification.CLD2.Native/Scripts/setup-runtime.sh b/src/LanguageIdentification.CLD2.Native/Scripts/setup-runtime.sh new file mode 100644 index 0000000..1ae5c1d --- /dev/null +++ b/src/LanguageIdentification.CLD2.Native/Scripts/setup-runtime.sh @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +echo "Installing runtime packages"; + diff --git a/src/LanguageIdentification.CLD2/CLD2Detector.cs b/src/LanguageIdentification.CLD2/CLD2Detector.cs new file mode 100644 index 0000000..733f9b5 --- /dev/null +++ b/src/LanguageIdentification.CLD2/CLD2Detector.cs @@ -0,0 +1,57 @@ +using System.Runtime.InteropServices; + +namespace LanguageIdentification.CLD2; + +public class CLD2Detector : IDisposable +{ + private readonly SemaphoreSlim _semaphore; + + public CLD2Detector() + { + _semaphore = new(1, 1); + } + + public IEnumerable PredictLanguage(string text) + { + try + { + _semaphore.Wait(); + + var textLength = text.Length; + var textPointer = Marshal.StringToHGlobalUni(text); + + var resultPtr = CLD2DetectorWrapper.PredictLanguage( + data: textPointer, + length: textLength, + resultCount: out var resultCount + ); + + try + { + var result = new CLD2PredictionResult[resultCount]; + var structSize = Marshal.SizeOf(typeof(CLD2PredictionResult)); + + for (var i = 0; i < resultCount; i++) + { + result[i] = Marshal.PtrToStructure(resultPtr + i * structSize); + } + + 
return result; + } + finally + { + CLD2DetectorWrapper.FreeResults(resultPtr, resultCount); + Marshal.FreeHGlobal(textPointer); + } + } + finally + { + _semaphore.Release(); + } + } + + public void Dispose() + { + GC.SuppressFinalize(this); + } +} diff --git a/src/LanguageIdentification.CLD2/CLD2DetectorWrapper.cs b/src/LanguageIdentification.CLD2/CLD2DetectorWrapper.cs new file mode 100644 index 0000000..cdab280 --- /dev/null +++ b/src/LanguageIdentification.CLD2/CLD2DetectorWrapper.cs @@ -0,0 +1,13 @@ +using System.Runtime.InteropServices; +using LanguageIdentification.CLD2.Native; + +namespace LanguageIdentification.CLD2; + +internal static class CLD2DetectorWrapper +{ + [DllImport(CLD2NativeLibrary.Name, CallingConvention = CallingConvention.Cdecl)] + public static extern nint PredictLanguage(IntPtr data, int length, out int resultCount); + + [DllImport(CLD2NativeLibrary.Name, CallingConvention = CallingConvention.Cdecl)] + public static extern void FreeResults(nint results, int count); +} diff --git a/src/LanguageIdentification.CLD2/CLD2PredictionResult.cs b/src/LanguageIdentification.CLD2/CLD2PredictionResult.cs new file mode 100644 index 0000000..4888357 --- /dev/null +++ b/src/LanguageIdentification.CLD2/CLD2PredictionResult.cs @@ -0,0 +1,22 @@ +using System.Runtime.InteropServices; + +namespace LanguageIdentification.CLD2; + +[StructLayout(LayoutKind.Sequential, CharSet = CharSet.Ansi)] +public readonly struct CLD2PredictionResult +{ + [MarshalAs(UnmanagedType.LPStr)] + public readonly string Language; + + [MarshalAs(UnmanagedType.LPStr)] + public readonly string Script; + + [MarshalAs(UnmanagedType.R8)] + public readonly double Probability; + + [MarshalAs(UnmanagedType.I1)] + public readonly bool IsReliable; + + [MarshalAs(UnmanagedType.R8)] + public readonly double Proportion; +} diff --git a/src/LanguageIdentification.CLD2/LanguageIdentification.CLD2.csproj b/src/LanguageIdentification.CLD2/LanguageIdentification.CLD2.csproj new file mode 100644 index 
0000000..33c6d6e --- /dev/null +++ b/src/LanguageIdentification.CLD2/LanguageIdentification.CLD2.csproj @@ -0,0 +1,28 @@ + + + + net8.0 + 0.0.0.1 + LanguageIdentification.CLD2 + LanguageIdentification.CLD2 + LanguageIdentification.CLD2 + Alexander Gluschenko + https://github.com/gluschenko/language-identification + enable + enable + true + + + + true + $(NoWarn);1591,1573 + ..\LanguageIdentification.CLD2.xml + + + + + All + + + + diff --git a/src/LanguageIdentification.sln b/src/LanguageIdentification.sln index 0263439..964088d 100644 --- a/src/LanguageIdentification.sln +++ b/src/LanguageIdentification.sln @@ -32,11 +32,19 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "cld3", "cld3", "{F54C81D9-9 EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "fasttext", "fasttext", "{FCC39A16-91D3-48C6-BEBE-4CFB4CA6A365}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LanguageIdentification.FastText", "LanguageIdentification.FastText\LanguageIdentification.FastText.csproj", "{9310BAF6-084A-43C8-8949-B84A29B67A2A}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LanguageIdentification.FastText", "LanguageIdentification.FastText\LanguageIdentification.FastText.csproj", "{9310BAF6-084A-43C8-8949-B84A29B67A2A}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LanguageIdentification.FastText.ConsoleTest", "LanguageIdentification.FastText.ConsoleTest\LanguageIdentification.FastText.ConsoleTest.csproj", "{F9061126-EC76-4405-B8F4-ADBF436B66D6}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LanguageIdentification.FastText.ConsoleTest", "LanguageIdentification.FastText.ConsoleTest\LanguageIdentification.FastText.ConsoleTest.csproj", "{F9061126-EC76-4405-B8F4-ADBF436B66D6}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LanguageIdentification.FastText.Native", "LanguageIdentification.FastText.Native\LanguageIdentification.FastText.Native.csproj", "{B6A6035E-AB74-4188-87D1-176C160A60B0}" 
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LanguageIdentification.FastText.Native", "LanguageIdentification.FastText.Native\LanguageIdentification.FastText.Native.csproj", "{B6A6035E-AB74-4188-87D1-176C160A60B0}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "cld2", "cld2", "{33499C07-9A27-4AD7-9855-F01EAEC392CD}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LanguageIdentification.CLD2", "LanguageIdentification.CLD2\LanguageIdentification.CLD2.csproj", "{933AE320-DCC7-483D-8D3A-DC79A30CE275}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LanguageIdentification.CLD2.Native", "LanguageIdentification.CLD2.Native\LanguageIdentification.CLD2.Native.csproj", "{FA0388BE-DF78-41D4-A09D-9EE1DED77097}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LanguageIdentification.CLD2.ConsoleTest", "LanguageIdentification.CLD2.ConsoleTest\LanguageIdentification.CLD2.ConsoleTest.csproj", "{7FD7BC00-00A2-4391-94D1-CD848C5DD471}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution @@ -72,6 +80,18 @@ Global {B6A6035E-AB74-4188-87D1-176C160A60B0}.Debug|Any CPU.Build.0 = Debug|Any CPU {B6A6035E-AB74-4188-87D1-176C160A60B0}.Release|Any CPU.ActiveCfg = Release|Any CPU {B6A6035E-AB74-4188-87D1-176C160A60B0}.Release|Any CPU.Build.0 = Release|Any CPU + {933AE320-DCC7-483D-8D3A-DC79A30CE275}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {933AE320-DCC7-483D-8D3A-DC79A30CE275}.Debug|Any CPU.Build.0 = Debug|Any CPU + {933AE320-DCC7-483D-8D3A-DC79A30CE275}.Release|Any CPU.ActiveCfg = Release|Any CPU + {933AE320-DCC7-483D-8D3A-DC79A30CE275}.Release|Any CPU.Build.0 = Release|Any CPU + {FA0388BE-DF78-41D4-A09D-9EE1DED77097}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {FA0388BE-DF78-41D4-A09D-9EE1DED77097}.Debug|Any CPU.Build.0 = Debug|Any CPU + {FA0388BE-DF78-41D4-A09D-9EE1DED77097}.Release|Any CPU.ActiveCfg = Release|Any CPU + {FA0388BE-DF78-41D4-A09D-9EE1DED77097}.Release|Any CPU.Build.0 = 
Release|Any CPU + {7FD7BC00-00A2-4391-94D1-CD848C5DD471}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {7FD7BC00-00A2-4391-94D1-CD848C5DD471}.Debug|Any CPU.Build.0 = Debug|Any CPU + {7FD7BC00-00A2-4391-94D1-CD848C5DD471}.Release|Any CPU.ActiveCfg = Release|Any CPU + {7FD7BC00-00A2-4391-94D1-CD848C5DD471}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -83,6 +103,9 @@ Global {9310BAF6-084A-43C8-8949-B84A29B67A2A} = {FCC39A16-91D3-48C6-BEBE-4CFB4CA6A365} {F9061126-EC76-4405-B8F4-ADBF436B66D6} = {FCC39A16-91D3-48C6-BEBE-4CFB4CA6A365} {B6A6035E-AB74-4188-87D1-176C160A60B0} = {FCC39A16-91D3-48C6-BEBE-4CFB4CA6A365} + {933AE320-DCC7-483D-8D3A-DC79A30CE275} = {33499C07-9A27-4AD7-9855-F01EAEC392CD} + {FA0388BE-DF78-41D4-A09D-9EE1DED77097} = {33499C07-9A27-4AD7-9855-F01EAEC392CD} + {7FD7BC00-00A2-4391-94D1-CD848C5DD471} = {33499C07-9A27-4AD7-9855-F01EAEC392CD} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {C9437BCA-E720-4E50-AD1B-6AC91C4A4E5C} From efc7ca622225b34c85a9cf88ca48476f5685e32a Mon Sep 17 00:00:00 2001 From: Alexander Gluschenko Date: Thu, 4 Jul 2024 21:28:36 +0400 Subject: [PATCH 3/7] CLD3: refactoring --- .../Native/bind.cc | 20 +++++++++---------- .../Native/bind.h | 8 ++++---- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/LanguageIdentification.CLD3.Native/Native/bind.cc b/src/LanguageIdentification.CLD3.Native/Native/bind.cc index a0de06c..f82f3d5 100644 --- a/src/LanguageIdentification.CLD3.Native/Native/bind.cc +++ b/src/LanguageIdentification.CLD3.Native/Native/bind.cc @@ -11,11 +11,11 @@ void FreeIdentifier(void* identifier) { delete static_cast(identifier); } -Result FindLanguage(void* identifier, const char* text) { +PredictionResult FindLanguage(void* identifier, const char* text) { NNetLanguageIdentifier* nativeIdentifier = static_cast(identifier); auto nativeResult = nativeIdentifier->FindLanguage(text); - 
Result result; + PredictionResult result; result.language = strdup(nativeResult.language.c_str()); result.probability = nativeResult.probability; result.is_reliable = nativeResult.is_reliable; @@ -23,22 +23,22 @@ Result FindLanguage(void* identifier, const char* text) { return result; } -Result* FindTopNMostFreqLangs(void* identifier, const char* text, int numLangs, int* resultCount) { +PredictionResult* FindTopNMostFreqLangs(void* identifier, const char* text, int numLangs, int* resultCount) { NNetLanguageIdentifier* nativeIdentifier = static_cast(identifier); auto nativeResults = nativeIdentifier->FindTopNMostFreqLangs(text, numLangs); *resultCount = static_cast(nativeResults.size()); - Result* results = new Result[*resultCount]; + PredictionResult* result = new PredictionResult[*resultCount]; for (int i = 0; i < *resultCount; ++i) { - results[i].language = strdup(nativeResults[i].language.c_str()); - results[i].probability = nativeResults[i].probability; - results[i].is_reliable = nativeResults[i].is_reliable; - results[i].proportion = nativeResults[i].proportion; + result[i].language = strdup(nativeResults[i].language.c_str()); + result[i].probability = nativeResults[i].probability; + result[i].is_reliable = nativeResults[i].is_reliable; + result[i].proportion = nativeResults[i].proportion; } - return results; + return result; } -void FreeResults(Result* results, int count) { +void FreeResults(PredictionResult* results, int count) { for (int i = 0; i < count; ++i) { free((void*)results[i].language); } diff --git a/src/LanguageIdentification.CLD3.Native/Native/bind.h b/src/LanguageIdentification.CLD3.Native/Native/bind.h index 3041778..3e19b60 100644 --- a/src/LanguageIdentification.CLD3.Native/Native/bind.h +++ b/src/LanguageIdentification.CLD3.Native/Native/bind.h @@ -18,7 +18,7 @@ using namespace std; #endif extern "C" { - struct Result { + struct PredictionResult { const char* language; double probability; bool is_reliable; @@ -27,7 +27,7 @@ extern "C" { 
EXPORT void* CreateIdentifier(int minNumBytes, int maxNumBytes); EXPORT void FreeIdentifier(void* identifier); - EXPORT Result FindLanguage(void* identifier, const char* text); - EXPORT Result* FindTopNMostFreqLangs(void* identifier, const char* text, int numLangs, int* resultCount); - EXPORT void FreeResults(Result* results, int count); + EXPORT PredictionResult FindLanguage(void* identifier, const char* text); + EXPORT PredictionResult* FindTopNMostFreqLangs(void* identifier, const char* text, int numLangs, int* resultCount); + EXPORT void FreeResults(PredictionResult* results, int count); } \ No newline at end of file From 6a8ebf8d0b69f54aae884f884e8952959af65fd8 Mon Sep 17 00:00:00 2001 From: Alexander Gluschenko Date: Thu, 4 Jul 2024 21:30:25 +0400 Subject: [PATCH 4/7] Update github-ci.yml --- .github/workflows/github-ci.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/github-ci.yml b/.github/workflows/github-ci.yml index 6dc78f8..86a92e4 100644 --- a/.github/workflows/github-ci.yml +++ b/.github/workflows/github-ci.yml @@ -16,6 +16,8 @@ jobs: fail-fast: false matrix: projectName: + - LanguageIdentification.CLD2.Native + - LanguageIdentification.CLD2 - LanguageIdentification.CLD3.Native - LanguageIdentification.CLD3 - LanguageIdentification.FastText.Native @@ -61,6 +63,8 @@ jobs: fail-fast: false matrix: projectName: + - LanguageIdentification.CLD2.Native + - LanguageIdentification.CLD2 - LanguageIdentification.CLD3.Native - LanguageIdentification.CLD3 - LanguageIdentification.FastText.Native From 3a8254cda927e5283c9f1e4aed378078d308fe48 Mon Sep 17 00:00:00 2001 From: Alexander Gluschenko Date: Thu, 4 Jul 2024 21:33:17 +0400 Subject: [PATCH 5/7] CLD2: using strdup() to copy string in result --- src/LanguageIdentification.CLD2.Native/Native/binding.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/LanguageIdentification.CLD2.Native/Native/binding.cc 
b/src/LanguageIdentification.CLD2.Native/Native/binding.cc index e8d105d..948c23c 100644 --- a/src/LanguageIdentification.CLD2.Native/Native/binding.cc +++ b/src/LanguageIdentification.CLD2.Native/Native/binding.cc @@ -56,8 +56,8 @@ extern "C" PredictionResult* result = new PredictionResult[*resultCount]; for (int i = 0; i < *resultCount; ++i) { - result[i].language = CLD2::LanguageCode(language3[i]); - result[i].script = CLD2::ULScriptCode(CLD2::LanguageRecognizedScript(language3[i], 0)); + result[i].language = strdup(CLD2::LanguageCode(language3[i])); + result[i].script = strdup(CLD2::ULScriptCode(CLD2::LanguageRecognizedScript(language3[i], 0))); result[i].probability = normalized_score3[i]; result[i].is_reliable = is_reliable; result[i].proportion = percent3[i]; @@ -72,6 +72,6 @@ extern "C" free((void*)results[i].language); free((void*)results[i].script); } - // delete[] results; + delete[] results; } } \ No newline at end of file From 694285fd5aa3aa97e211d5c0c66c3279b30580d2 Mon Sep 17 00:00:00 2001 From: Alexander Gluschenko Date: Thu, 4 Jul 2024 23:08:46 +0400 Subject: [PATCH 6/7] CLD2: encoding fixes + probability calculation --- .../Native/binding.cc | 57 +++++++++++-------- .../Native/binding.h | 2 +- .../CLD2Detector.cs | 9 +-- .../CLD2DetectorWrapper.cs | 2 +- 4 files changed, 37 insertions(+), 33 deletions(-) diff --git a/src/LanguageIdentification.CLD2.Native/Native/binding.cc b/src/LanguageIdentification.CLD2.Native/Native/binding.cc index 948c23c..4320703 100644 --- a/src/LanguageIdentification.CLD2.Native/Native/binding.cc +++ b/src/LanguageIdentification.CLD2.Native/Native/binding.cc @@ -1,4 +1,4 @@ -#include +#include #include #include #include @@ -9,58 +9,67 @@ extern "C" { - PredictionResult* PredictLanguage(char *data, int length, int* resultCount) + PredictionResult* PredictLanguage(char *text, int* resultCount) { + int textLength = strlen(text); + bool is_plain_text = true; CLD2::CLDHints cldhints = {NULL, NULL, 0, CLD2::UNKNOWN_LANGUAGE}; 
bool allow_extended_lang = true; int flags = 0; - CLD2::Language language3[MAX_LANGUAGE_COUNT]; - int percent3[MAX_LANGUAGE_COUNT]; - double normalized_score3[MAX_LANGUAGE_COUNT]; CLD2::ResultChunkVector result_chunk_vector; int text_bytes; bool is_reliable; - if (length <= 0) - { - length = strlen(data); - } + CLD2::Language languages[MAX_LANGUAGE_COUNT]; + int percents[MAX_LANGUAGE_COUNT]; + double scores[MAX_LANGUAGE_COUNT]; CLD2::Language summary_lang = CLD2::UNKNOWN_LANGUAGE; summary_lang = CLD2::ExtDetectLanguageSummary( - data, - length, + text, + textLength, is_plain_text, &cldhints, flags, - language3, - percent3, - normalized_score3, + languages, + percents, + scores, &result_chunk_vector, &text_bytes, &is_reliable); - int a = 0; + int predictionCount = 0; for (int i = 0; i < MAX_LANGUAGE_COUNT; ++i) { - // if (percent3[i] > 0) + if (percents[i] > 0 || (i == 0 && languages[i] == CLD2::UNKNOWN_LANGUAGE)) { - a++; + predictionCount++; } } - *resultCount = a; + *resultCount = predictionCount; + + int scoreTotal = 0; + + for (int i = 0; i < predictionCount; ++i) { + scoreTotal += scores[i]; + } + + PredictionResult* result = new PredictionResult[predictionCount]; + for (int i = 0; i < predictionCount; ++i) { + + CLD2::Language language = languages[i]; + double probability = scoreTotal > 0 ? 
scores[i] / (double)scoreTotal : 1.0; + double proportion = percents[i] / 100.0; - PredictionResult* result = new PredictionResult[*resultCount]; - for (int i = 0; i < *resultCount; ++i) { - result[i].language = strdup(CLD2::LanguageCode(language3[i])); - result[i].script = strdup(CLD2::ULScriptCode(CLD2::LanguageRecognizedScript(language3[i], 0))); - result[i].probability = normalized_score3[i]; + result[i].language = strdup(CLD2::LanguageCode(language)); + result[i].script = strdup(CLD2::ULScriptCode(CLD2::LanguageRecognizedScript(language, 0))); + result[i].probability = probability; result[i].is_reliable = is_reliable; - result[i].proportion = percent3[i]; + result[i].proportion = proportion; } return result; diff --git a/src/LanguageIdentification.CLD2.Native/Native/binding.h b/src/LanguageIdentification.CLD2.Native/Native/binding.h index 4272ccd..f221d84 100644 --- a/src/LanguageIdentification.CLD2.Native/Native/binding.h +++ b/src/LanguageIdentification.CLD2.Native/Native/binding.h @@ -22,7 +22,7 @@ extern "C" double proportion; }; - EXPORT PredictionResult* PredictLanguage(char *data, int length, int* resultCount); + EXPORT PredictionResult* PredictLanguage(char* text, int* resultCount); EXPORT void FreeResults(PredictionResult* results, int count); } diff --git a/src/LanguageIdentification.CLD2/CLD2Detector.cs b/src/LanguageIdentification.CLD2/CLD2Detector.cs index 733f9b5..484da6a 100644 --- a/src/LanguageIdentification.CLD2/CLD2Detector.cs +++ b/src/LanguageIdentification.CLD2/CLD2Detector.cs @@ -17,12 +17,8 @@ public IEnumerable PredictLanguage(string text) { _semaphore.Wait(); - var textLength = text.Length; - var textPointer = Marshal.StringToHGlobalUni(text); - var resultPtr = CLD2DetectorWrapper.PredictLanguage( - data: textPointer, - length: textLength, + text: text, resultCount: out var resultCount ); @@ -36,12 +32,11 @@ public IEnumerable PredictLanguage(string text) result[i] = Marshal.PtrToStructure(resultPtr + i * structSize); } - return 
result; + return result.OrderByDescending(x => x.Probability).ToArray(); } finally { CLD2DetectorWrapper.FreeResults(resultPtr, resultCount); - Marshal.FreeHGlobal(textPointer); } } finally diff --git a/src/LanguageIdentification.CLD2/CLD2DetectorWrapper.cs b/src/LanguageIdentification.CLD2/CLD2DetectorWrapper.cs index cdab280..b68a908 100644 --- a/src/LanguageIdentification.CLD2/CLD2DetectorWrapper.cs +++ b/src/LanguageIdentification.CLD2/CLD2DetectorWrapper.cs @@ -6,7 +6,7 @@ namespace LanguageIdentification.CLD2; internal static class CLD2DetectorWrapper { [DllImport(CLD2NativeLibrary.Name, CallingConvention = CallingConvention.Cdecl)] - public static extern nint PredictLanguage(IntPtr data, int length, out int resultCount); + public static extern nint PredictLanguage(string text, out int resultCount); [DllImport(CLD2NativeLibrary.Name, CallingConvention = CallingConvention.Cdecl)] public static extern void FreeResults(nint results, int count); From 2085387dbc46e684778bfb3fdff02b4fc6b9046c Mon Sep 17 00:00:00 2001 From: Alexander Gluschenko Date: Thu, 4 Jul 2024 23:33:19 +0400 Subject: [PATCH 7/7] bind -> binding --- .../Native/{bind.cc => binding.cc} | 2 +- .../Native/{bind.h => binding.h} | 0 src/LanguageIdentification.CLD3.Native/Native/setup.py | 2 +- src/LanguageIdentification.CLD3.Native/Scripts/run-build.sh | 4 ++-- .../Native/CMakeLists.txt | 4 ++-- .../Native/{bind.cc => binding.cc} | 2 +- .../Native/{bind.h => binding.h} | 0 7 files changed, 7 insertions(+), 7 deletions(-) rename src/LanguageIdentification.CLD3.Native/Native/{bind.cc => binding.cc} (98%) rename src/LanguageIdentification.CLD3.Native/Native/{bind.h => binding.h} (100%) rename src/LanguageIdentification.FastText.Native/Native/{bind.cc => binding.cc} (99%) rename src/LanguageIdentification.FastText.Native/Native/{bind.h => binding.h} (100%) diff --git a/src/LanguageIdentification.CLD3.Native/Native/bind.cc b/src/LanguageIdentification.CLD3.Native/Native/binding.cc similarity index 98% 
rename from src/LanguageIdentification.CLD3.Native/Native/bind.cc rename to src/LanguageIdentification.CLD3.Native/Native/binding.cc index f82f3d5..d5cda6c 100644 --- a/src/LanguageIdentification.CLD3.Native/Native/bind.cc +++ b/src/LanguageIdentification.CLD3.Native/Native/binding.cc @@ -1,4 +1,4 @@ -#include "bind.h" +#include "binding.h" #include using namespace chrome_lang_id; diff --git a/src/LanguageIdentification.CLD3.Native/Native/bind.h b/src/LanguageIdentification.CLD3.Native/Native/binding.h similarity index 100% rename from src/LanguageIdentification.CLD3.Native/Native/bind.h rename to src/LanguageIdentification.CLD3.Native/Native/binding.h diff --git a/src/LanguageIdentification.CLD3.Native/Native/setup.py b/src/LanguageIdentification.CLD3.Native/Native/setup.py index 44e41c0..f51003c 100644 --- a/src/LanguageIdentification.CLD3.Native/Native/setup.py +++ b/src/LanguageIdentification.CLD3.Native/Native/setup.py @@ -50,7 +50,7 @@ 'src/cld_3/protos/sentence.pb.cc', 'src/cld_3/protos/task_spec.pb.cc', # CUSTOM - 'src/bind.cc', + 'src/binding.cc', ] diff --git a/src/LanguageIdentification.CLD3.Native/Scripts/run-build.sh b/src/LanguageIdentification.CLD3.Native/Scripts/run-build.sh index 5c64822..2b9e61a 100644 --- a/src/LanguageIdentification.CLD3.Native/Scripts/run-build.sh +++ b/src/LanguageIdentification.CLD3.Native/Scripts/run-build.sh @@ -11,8 +11,8 @@ ls -R . 
rm "$workspace/setup.py" cp Native/setup.py "$workspace/setup.py" -cp Native/bind.cc "$workspace/src/bind.cc" -cp Native/bind.h "$workspace/src/bind.h" +cp Native/binding.cc "$workspace/src/binding.cc" +cp Native/binding.h "$workspace/src/binding.h" cd "$workspace" export PYTHONPATH=$(pwd)/site-packages diff --git a/src/LanguageIdentification.FastText.Native/Native/CMakeLists.txt b/src/LanguageIdentification.FastText.Native/Native/CMakeLists.txt index fbed972..0fe2502 100644 --- a/src/LanguageIdentification.FastText.Native/Native/CMakeLists.txt +++ b/src/LanguageIdentification.FastText.Native/Native/CMakeLists.txt @@ -17,11 +17,11 @@ include_directories( ${PROJECT_SOURCE_DIR}/include) -set(SOURCES ${PROJECT_SOURCE_DIR}/bind.cc) +set(SOURCES ${PROJECT_SOURCE_DIR}/binding.cc) add_library(objlib OBJECT ${SOURCES}) add_library(fasttext SHARED $<TARGET_OBJECTS:objlib>) -set_target_properties(fasttext PROPERTIES PUBLIC_HEADER ${PROJECT_SOURCE_DIR}/bind.h) +set_target_properties(fasttext PROPERTIES PUBLIC_HEADER ${PROJECT_SOURCE_DIR}/binding.h) target_link_libraries(fasttext fasttext-static_pic) diff --git a/src/LanguageIdentification.FastText.Native/Native/bind.cc b/src/LanguageIdentification.FastText.Native/Native/binding.cc similarity index 99% rename from src/LanguageIdentification.FastText.Native/Native/bind.cc rename to src/LanguageIdentification.FastText.Native/Native/binding.cc index 59b2506..0b6b487 100644 --- a/src/LanguageIdentification.FastText.Native/Native/bind.cc +++ b/src/LanguageIdentification.FastText.Native/Native/binding.cc @@ -6,7 +6,7 @@ #include "fasttext.h" #include "autotune.h" -#include "bind.h" +#include "binding.h" using namespace std; using namespace fasttext; diff --git a/src/LanguageIdentification.FastText.Native/Native/bind.h b/src/LanguageIdentification.FastText.Native/Native/binding.h similarity index 100% rename from src/LanguageIdentification.FastText.Native/Native/bind.h rename to src/LanguageIdentification.FastText.Native/Native/binding.h