-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexample - text compression with word repetition.au3
187 lines (147 loc) · 5.74 KB
/
example - text compression with word repetition.au3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#include "Huffman.au3"
; read out the autoitscript.com page as example string
Global $sString = BinaryToString(InetRead("https://autoitscript.com"))
ConsoleWrite("Original Size: " & BinaryLen(StringToBinary($sString)) & " Bytes" & @CRLF)
; precompress at text level by word repetition:
Global $sStringCompressed = _strcmp_compressByWordRepetition($sString)
; encode the string with a canonical huffman encoding
Global $bEncoded = _huff_encodeString($sStringCompressed)
ConsoleWrite("Encoded Size: " & BinaryLen($bEncoded) & " Bytes" & @CRLF & _
"Compress ratio: " & Round((1 - (BinaryLen($bEncoded) / BinaryLen(StringToBinary($sString)))) * 100.0, 1) & ' %' & @CRLF)
; decode the string out of the binary
Global $sStringDecoded = _huff_decodeBinary($bEncoded)
; decompress at text level
$sStringDecoded = _strcmp_decompressByWordRepetition($sStringDecoded)
; check if original and decoded string are the same
ConsoleWrite("$sString == $sStringDecoded: " & ($sString == $sStringDecoded) & @CRLF)
; Tokenbasierte Kompression - ersetzt wiederholende Teile durch Nummern im Text
Func _strcmp_compressByWordRepetition($sString)
Local $mTmp[3], $aTmp[2], $mPos, $iLen, _
$mWords[], _
$iPos = 1, _
$sWord, $iWord = 0
; Bereits vorhandene Präfixnummern escapen:
$sString = StringRegExpReplace($sString, '(?s)\b(\d+)', Chr(27) & "$1")
; Einzelne Wörter aus dem String extrahieren
Do
$aMatch = StringRegExp($sString, '(?s)\G.*?(\b[a-zA-ZäöüßÄÖÜ][\wäöüßÄÖÜ]{2,}\b)', 1, $iPos)
IF @error Then ExitLoop
$iPos = @extended
$sWord = $aMatch[0]
If MapExists($mWords, $sWord) Then
$aTmp = $mWords[$sWord]
$mTmp = $aTmp[1]
MapAppend($mTmp, $iPos - StringLen($sWord))
$aTmp[1] = $mTmp
Else
Local $mTmp[]
Local $aTmp[2] = [$iWord, $mTmp]
EndIf
$mWords[$sWord] = $aTmp
$iWord += 1
Until 0
; bereite die notwendigen Replaces auf
Local $aReplaces[$iWord][3], $iIndRepl = 0
For $sWord In MapKeys($mWords)
$iLen = StringLen($sWord)
$aTmp = $mWords[$sWord]
$iWord = $aTmp[0]
$mPos = $aTmp[1]
If Ubound($mPos) < 1 Then ContinueLoop
If $iLen <= Stringlen($iWord) Then ContinueLoop
For $i In MapKeys($mPos)
$iPos = $mPos[$i]
$aReplaces[$iIndRepl][0] = $iPos
$aReplaces[$iIndRepl][1] = $iLen
$aReplaces[$iIndRepl][2] = $iWord
$iIndRepl += 1
Next
Next
Redim $aReplaces[$iIndRepl][3]
_ArraySort($aReplaces)
; Wörter durch Nummern im Text selbst ersetzen
For $i = UBound($aReplaces) - 1 To 0 Step -1
$sString = StringLeft($sString, $aReplaces[$i][0] - 1) & _
$aReplaces[$i][2] & _
StringTrimLeft($sString, $aReplaces[$i][0] + $aReplaces[$i][1] -1)
Next
Return $sString
EndFunc
; Tokenbasierte Dekompression - stellt einen mit _strcmp_compressByWordRepetition() komprimierten String wieder her
Func _strcmp_decompressByWordRepetition($sString)
Local $aTmp[3], _
$mWords[]
$iPos = 1
; Einzelne Wörter aus dem String extrahieren
Do
$aMatch = StringRegExp($sString, '(?s)\G.*?(\b[a-zA-ZäöüßÄÖÜ][\wäöüßÄÖÜ]{2,}\b|\b(?<!\x1B)\d+\b)', 1, $iPos)
IF @error Then ExitLoop
$iPos = @extended
$aTmp[0] = $aMatch[0]
$aTmp[1] = $iPos - StringLen($aMatch[0])
$aTmp[2] = StringLen($aMatch[0])
MapAppend($mWords, $aTmp)
Until 0
Local $aWords = _map_IntMap2Array($mWords)
$aWords = _ArrayAinATo2d($aWords)
; Nummern wieder durch Buchstaben ersetzen
Local $iWord
For $i = UBound($aWords) - 1 To 0 Step -1
If Not StringRegExp($aWords[$i][0], "^\d+$", 0) Then ContinueLoop
$iWord = Int($aWords[$i][0])
If $iWord >= UBound($aWords) Then ContinueLoop
;~ ConsoleWrite($iWord & @CRLF)
$sString = StringLeft($sString, $aWords[$i][1] - 1) & _
$aWords[$iWord][0] & _
StringTrimLeft($sString, $aWords[$i][1] + $aWords[$i][2] -1)
Next
; Escapte Zahlen wieder unescapen:
$sString = StringRegExpReplace($sString, '(?s)\x1B(\d+)', "$1")
Return $sString
EndFunc
; #FUNCTION# ======================================================================================
; Name ..........: _map_IntMap2Array()
; Description ...: returns the values of a map only (useful for array-list like maps produced by MapAppend())
; Syntax ........: _map_IntMap2Array(ByRef $aMap)
; Parameters ....: ByRef $aMap - a map variable
; Return values .: 1D-array containing the values
; Author ........: aspirinjunkie
; Modified ......: 2022-07-13
; =================================================================================================
Func _map_IntMap2Array(ByRef $aMap)
Local $aRet[UBound($aMap)]
Local $iInd = 0
For $vEl In $aMap
$aRet[$iInd] = $vEl
$iInd += 1
Next
Return $aRet
EndFunc ;==>_map_IntMap2Array
; #FUNCTION# ======================================================================================
; Name ..........: _ArrayAinATo2d()
; Description ...: Convert a Arrays in Array into a 2D array
; Syntax ........: _ArrayAinATo2d(ByRef $A)
; Parameters ....: $A - the arrays in array which should be converted
; Return values .: Success: a 2D Array build from the input array
; Failure: Null
; @error = 1: $A is'nt an 1D array
; = 2: $A is empty
; = 3: first element isn't a array
; Author ........: AspirinJunkie
; =================================================================================================
Func _ArrayAinATo2d(ByRef $A)
If UBound($A, 0) <> 1 Then Return SetError(1, UBound($A, 0), Null)
Local $N = UBound($A)
If $N < 1 Then Return SetError(2, $N, Null)
Local $u = UBound($A[0])
If $u < 1 Then Return SetError(3, $u, Null)
Local $a_Ret[$N][$u]
For $i = 0 To $N - 1
Local $t = $A[$i]
If UBound($t) > $u Then ReDim $a_Ret[$N][UBound($t)]
For $j = 0 To UBound($t) - 1
$a_Ret[$i][$j] = $t[$j]
Next
Next
Return SetExtended($N, $a_Ret)
EndFunc ;==>_ArrayAinATo2d