-
Notifications
You must be signed in to change notification settings - Fork 0
/
lzw.cc
185 lines (159 loc) · 5.38 KB
/
lzw.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
//
// Created by tunc on 04/10/2021.
//
#include <unordered_map>
#include <string>
#include "lzw.hh"
#include "bitio.hh"
// Constructor
LZW::LZW() = default;
// Destructor
LZW::~LZW() = default;
void LZW::encode(std::ifstream* is, std::ofstream* os) {
// Create symbol to code table
// To make this code work faster, I would probably want to optimize this table, and not use these strings as keys
std::unordered_map<std::string, int> table;
for (int i = 0; i < LZW::ALPHABET_SIZE; ++i){
table[std::to_string(i)] = i;
}
int code_size = LZW::STARTING_CODE_SIZE;
// Since the first code is 0, the next code is ALPHABET_SIZE
int next_code = LZW::ALPHABET_SIZE;
// When this is reached, we must increase the code size
int next_increase_point = LZW::ALPHABET_SIZE<<1;
BitIO bitio(os, nullptr);
// K is used to read from is
int K;
// Used to not call to_string on K multiple times.
std::string str_K;
// Start encoding
K = is->get();
std::string w = std::to_string(K);
// This is here instead of while(true) in case input has only 1 byte
// K will be -1 at EOF
while (K != -1){
K = is->get();
// get returns -1 if EOF
if (K == -1){
break;
}
// If our table is full, expand
if (next_code == next_increase_point){
code_size += 1;
next_increase_point <<= 1; // multiply by 2
}
str_K = std::to_string(K);
// If w+K is not new, build up w
if (table.contains(w+","+str_K)){
w += (","+str_K);
}
// New pattern
else{
int code = table[w];
// write code_size many bits, from left to right
// the first couple of bits might need to be 0 if code_size is longer than actual length of code
for (int i = code_size-1; i>=0; --i){
bitio.output_bit((code>>i)&1);
}
table[w+","+str_K] = next_code;
next_code += 1;
w = str_K;
}
}
// Write the last input
int code = table[w];
for (int i = code_size-1; i>=0; --i){
bitio.output_bit((code>>i)&1);
}
}
// For decoding table
// Instead of storing arbitrary long strings, just point to what should come before
struct symbol_type{
symbol_type* prev_symbol;
char curr_symbol;
};
// Reads `code_size` many bits from bitio to get the correct code
// bitio reads bit by bit, so this function abstracts the repetition during decoding
static int read_code(int code_size, BitIO* bitio){
int b = 0;
for (int i = 0; i < code_size; ++i){
b <<= 1;
b |= bitio->input_bit();
}
return b;
}
// Returns the first symbol to be saved in `FINchar` during decoding
static char write_symbol_and_return_first(symbol_type symbol, std::ofstream *os){
if (symbol.prev_symbol != nullptr){
char first_symbol = write_symbol_and_return_first(*symbol.prev_symbol, os);
os->put(symbol.curr_symbol);
return first_symbol;
}
else{
os->put(symbol.curr_symbol);
return symbol.curr_symbol;
}
}
void LZW::decode(std::ifstream *is, std::ofstream *os) {
// Create symbol to code table
std::unordered_map<int, symbol_type> table;
for (int i = 0; i < LZW::ALPHABET_SIZE; ++i){
symbol_type s{nullptr, (char)i};
table[i] = s;
}
int code_size = LZW::STARTING_CODE_SIZE;
int next_code = LZW::ALPHABET_SIZE;
int next_increase_point = LZW::ALPHABET_SIZE<<1;
BitIO bitio(nullptr, is);
// First code decoding
// I am assuming the file will be nonempty
int code = read_code(code_size, &bitio);
int OLDcode = code;
int INcode;
symbol_type symbol = table[code];
os->put(symbol.curr_symbol);
char FINchar = symbol.curr_symbol;
// main decoding loop
while (true){
// if table has only 1 slot left, the next new code will fill that
// in which case encoding would increase code size
// so we must as well
if (next_code == next_increase_point - 1){
code_size += 1;
next_increase_point <<= 1;
}
code = read_code(code_size, &bitio);
// we can check for eof only after trying to read beyond what is in file
// Since our codes are longer than a byte, this happens exactly when there is nothing more to read
if (bitio.reached_eof()){
break;
}
INcode = code;
// If we haven't seen the code before
// it must be KwKwK (Kw was before so this is KwK)
if (!table.contains(code)){
// write Kw
code = OLDcode;
symbol = table[code];
FINchar = write_symbol_and_return_first(symbol, os);
// add new code
symbol_type next_symbol{&table[OLDcode], FINchar};
table[next_code] = next_symbol;
INcode = next_code;
OLDcode = INcode;
// write last K
os->put(FINchar);
next_code += 1;
}
else{
// write the pattern matching previously seen code
symbol = table[code];
FINchar = write_symbol_and_return_first(symbol, os);
// add new code
symbol_type next_symbol{&table[OLDcode], FINchar};
table[next_code] = next_symbol;
next_code += 1;
OLDcode = INcode;
}
}
}