Skip to content

Commit

Permalink
Merge branch 'topic/vadim/gcperf' into 'master'
Browse files Browse the repository at this point in the history
Performance improvements of grapheme cluster iterator

See merge request eng/ide/VSS!320
  • Loading branch information
godunko committed Mar 12, 2024
2 parents 50318b0 + 7855b6d commit f4cb759
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 111 deletions.
2 changes: 1 addition & 1 deletion data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ curl -o emoji/emoji-test.txt https://www.unicode.org/Public/emoji/15.1/emoji-tes
curl -o emoji/emoji-zwj-sequences.txt https://www.unicode.org/Public/emoji/15.1/emoji-zwj-sequences.txt
git clone https://github.com/nigeltao/parse-number-fxx-test-data
rm -rf parse-number-fxx-test-data/.git
git close https://github.com/json5/json5-tests.git
git clone https://github.com/json5/json5-tests.git
rm -rf json5-tests/.git
curl -O https://raw.githubusercontent.com/Perl/perl5/blead/t/re/re_tests
tar caf ../vss-tests-data.tar.bz2 .
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
-- Generic implementation of the string which use UTF-8 encoding for data.

with Ada.Unchecked_Deallocation;
with Interfaces;

with VSS.Implementation.GCC;
with VSS.Implementation.Line_Iterators;
with VSS.Implementation.String_Configuration;

Expand Down Expand Up @@ -1832,44 +1834,34 @@ package body VSS.Implementation.UTF8_String_Handlers is
end if;

declare
Code : constant VSS.Unicode.UTF8_Code_Unit :=
Storage (Position.UTF8_Offset);

begin
case Code is
when 16#00# .. 16#7F# =>
Position.UTF8_Offset := Position.UTF8_Offset + 1;
Position.UTF16_Offset := Position.UTF16_Offset + 1;
use type Interfaces.Integer_32;
use type VSS.Unicode.UTF8_Code_Unit;

when 16#C2# .. 16#DF# =>
Position.UTF8_Offset := Position.UTF8_Offset + 2;
Position.UTF16_Offset := Position.UTF16_Offset + 1;
-- This code relies on the fact that the starting byte of a multibyte
-- sequence in UTF-8 has its N most significant bits set to one,
-- followed by a zero bit. So, the first byte is negated and the number
-- of leading zero bits is counted (minus 24, since clz sees 32 bits).

when 16#E0# .. 16#EF# =>
Position.UTF8_Offset := Position.UTF8_Offset + 3;
Position.UTF16_Offset := Position.UTF16_Offset + 1;
Code : constant VSS.Unicode.UTF8_Code_Unit :=
Storage (Position.UTF8_Offset);
Length : constant Interfaces.Integer_32 :=
VSS.Implementation.GCC.clz (Interfaces.Unsigned_32 (not Code))
- 24;

when 16#F0# .. 16#F4# =>
Position.UTF8_Offset := Position.UTF8_Offset + 4;
Position.UTF16_Offset := Position.UTF16_Offset + 2;
begin
if Code <= 16#7F# then
Position.UTF8_Offset := Position.UTF8_Offset + 1;
Position.UTF16_Offset := Position.UTF16_Offset + 1;

when others =>
raise Program_Error with "string data is corrupted";
end case;
else
Position.UTF8_Offset :=
Position.UTF8_Offset
+ VSS.Unicode.UTF8_Code_Unit_Offset (Length);
Position.UTF16_Offset :=
Position.UTF16_Offset
+ VSS.Unicode.UTF16_Code_Unit_Offset (Length / 4 + 1);
end if;
end;

-- XXX case statement above may be rewritten as below to avoid
-- use of branch instructions.
--
-- Position.UTF8_Offset :=
-- Position.UTF8_Offset + 1
-- + (if (Code and 2#1000_0000#) = 2#1000_0000# then 1 else 0)
-- + (if (Code and 2#1110_0000#) = 2#1110_0000# then 1 else 0)
-- + (if (Code and 2#1111_0000#) = 2#1111_0000# then 1 else 0);
--
-- Position.UTF16_Offset :=
-- Position.UTF16_Offset + 1
-- + (if (Code and 2#1111_0000#) = 2#1111_0000# then 1 else 0);
end Unchecked_Forward;

-----------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,60 @@ package body VSS.Strings.Cursors.Iterators.Grapheme_Clusters is
Is_Linker : Boolean) return Boolean;
-- Scan string backward to check whether Rule GB9c should be applied.

type GCB_Action is (Break, No_Break, Unspecified);
-- Outcome of a Forward_GCB_Rules table lookup for a pair of adjacent
-- characters:
--
-- Break       - a grapheme cluster boundary lies between the pair;
-- No_Break    - the pair belongs to the same grapheme cluster;
-- Unspecified - the GCB property values alone do not decide, the caller
--               has to evaluate the remaining, context dependent rules.

-- The table below encodes segmentation rules that depend only on the
-- value of the GCB property.
--
-- The first index is the GCB value of the left (preceding) character, the
-- second index is the GCB value of the right (following) character. The
-- trailing comment of each entry names the segmentation rule which that
-- entry implements.

Forward_GCB_Rules : constant array
(VSS.Implementation.UCD_Core.GCB_Values,
VSS.Implementation.UCD_Core.GCB_Values) of GCB_Action :=
(GCB_CN => (others => Break), -- Rule GB4
GCB_CR =>
(GCB_LF => No_Break, -- Rule GB3
others => Break), -- Rule GB4
GCB_L =>
(GCB_CN | GCB_CR | GCB_LF => Break, -- Rule GB5
GCB_L | GCB_V | GCB_LV | GCB_LVT => No_Break, -- Rule GB6
GCB_EX | GCB_ZWJ => No_Break, -- Rule GB9
GCB_SM => No_Break, -- Rule GB9a
others => Unspecified),
GCB_LF => (others => Break), -- Rule GB4
GCB_LV =>
(GCB_CN | GCB_CR | GCB_LF => Break, -- Rule GB5
GCB_V | GCB_T => No_Break, -- Rule GB7
GCB_EX | GCB_ZWJ => No_Break, -- Rule GB9
GCB_SM => No_Break, -- Rule GB9a
others => Unspecified),
GCB_LVT =>
(GCB_CN | GCB_CR | GCB_LF => Break, -- Rule GB5
GCB_T => No_Break, -- Rule GB8
GCB_EX | GCB_ZWJ => No_Break, -- Rule GB9
GCB_SM => No_Break, -- Rule GB9a
others => Unspecified),
-- For the left GCB_PP every right value except GCB_CN, GCB_CR and GCB_LF
-- results in No_Break, so this row never yields Unspecified.
GCB_PP =>
(GCB_CN | GCB_CR | GCB_LF => Break, -- Rule GB5
GCB_EX | GCB_ZWJ => No_Break, -- Rule GB9
GCB_SM => No_Break, -- Rule GB9a
others => No_Break), -- Rule GB9b
GCB_T =>
(GCB_CN | GCB_CR | GCB_LF => Break, -- Rule GB5
GCB_T => No_Break, -- Rule GB8
GCB_EX | GCB_ZWJ => No_Break, -- Rule GB9
GCB_SM => No_Break, -- Rule GB9a
others => Unspecified),
GCB_V =>
(GCB_CN | GCB_CR | GCB_LF => Break, -- Rule GB5
GCB_V | GCB_T => No_Break, -- Rule GB7
GCB_EX | GCB_ZWJ => No_Break, -- Rule GB9
GCB_SM => No_Break, -- Rule GB9a
others => Unspecified),
-- Left values that have no rule of their own in this table: only the
-- rules selected by the right value apply.
GCB_EX | GCB_RI | GCB_SM | GCB_XX | GCB_ZWJ =>
(GCB_CN | GCB_CR | GCB_LF => Break, -- Rule GB5
GCB_EX | GCB_ZWJ => No_Break, -- Rule GB9
GCB_SM => No_Break, -- Rule GB9a
others => Unspecified));

-------------------
-- Apply_ExtPict --
-------------------
Expand Down Expand Up @@ -411,6 +465,7 @@ package body VSS.Strings.Cursors.Iterators.Grapheme_Clusters is
Left : VSS.Implementation.Strings.Cursor;
Left_Properties : VSS.Implementation.UCD_Core.Core_Data_Record;
Right : VSS.Implementation.Strings.Cursor;
Right_Code : VSS.Unicode.Code_Point'Base;
Right_Properties : VSS.Implementation.UCD_Core.Core_Data_Record;
Success : Boolean;
Done : Boolean := False;
Expand All @@ -426,92 +481,45 @@ package body VSS.Strings.Cursors.Iterators.Grapheme_Clusters is
Handler := VSS.Implementation.Strings.Handler (Data);

Self.First_Position := Self.Last_Position;
Success := Handler.Forward (Data, Self.First_Position);
Success :=
Handler.Forward_Element (Data, Self.First_Position, Right_Code);

if not Success then
-- End of the string has been reached.
-- XXX Should Last_Position be set to After_Last_Character?

return False;
end if;

else
Right := Self.First_Position;
Right_Properties :=
Extract_Core_Data (Handler.Element (Data, Right));

loop
Left := Right;
Left_Properties := Right_Properties;

Success := Handler.Forward (Data, Right);

if not Success then
-- End of line has been reached
-- Rule GB2

Self.Last_Position := Left;

return True;

else
Right_Properties :=
Extract_Core_Data (Handler.Element (Data, Right));

if Left_Properties.GCB = GCB_CR
and Right_Properties.GCB = GCB_LF
then
-- Rule GB3

null;

elsif Left_Properties.GCB in GCB_CN | GCB_CR | GCB_LF then
-- Rule GB4

Done := True;

elsif Right_Properties.GCB in GCB_CN | GCB_CR | GCB_LF then
-- Rule GB5

Done := True;

elsif Left_Properties.GCB = GCB_L
and then Right_Properties.GCB
in GCB_L | GCB_V | GCB_LV | GCB_LVT
then
-- Rule GB6

null;

elsif Left_Properties.GCB in GCB_LV | GCB_V
and then Right_Properties.GCB in GCB_V | GCB_T
then
-- Rule GB7

null;
Right := Self.First_Position;
Right_Properties := Extract_Core_Data (Right_Code);

elsif Left_Properties.GCB in GCB_LVT | GCB_T
and then Right_Properties.GCB = GCB_T
then
-- Rule GB8
loop
Left := Right;
Left_Properties := Right_Properties;

null;
Success := Handler.Forward_Element (Data, Right, Right_Code);

elsif Right_Properties.GCB in GCB_EX | GCB_ZWJ then
-- Rule GB9
if not Success then
-- End of the string has been reached
-- Rule GB2

null;
Self.Last_Position := Left;

elsif Right_Properties.GCB = GCB_SM then
-- Rule GB9a
return True;
end if;

null;
Right_Properties := Extract_Core_Data (Right_Code);

elsif Left_Properties.GCB = GCB_PP then
-- Rule GB9b
case Forward_GCB_Rules (Left_Properties.GCB, Right_Properties.GCB) is
when Break =>
Done := True;

null;
when No_Break =>
null;

elsif Left_Properties.InCB in INCB_Linker | INCB_Extend
when Unspecified =>
if Left_Properties.InCB in INCB_Linker | INCB_Extend
and then Right_Properties.InCB = INCB_Consonant
and then Apply_InCB
(Handler, Data, Left, Left_Properties.InCB = INCB_Linker)
Expand Down Expand Up @@ -540,15 +548,14 @@ package body VSS.Strings.Cursors.Iterators.Grapheme_Clusters is
else
Done := True;
end if;
end case;

if Done then
Self.Last_Position := Left;
if Done then
Self.Last_Position := Left;

return True;
end if;
end if;
end loop;
end if;
return True;
end if;
end loop;
end Forward;

-----------------
Expand Down

0 comments on commit f4cb759

Please sign in to comment.