-
Notifications
You must be signed in to change notification settings - Fork 1
/
worker.go
295 lines (239 loc) · 6.07 KB
/
worker.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
package conveyor
import (
"bytes"
"errors"
"fmt"
"io"
"sync"
)
// DefaultOverflowScanSize is the initial size in bytes of the overflow
// buffer used to find the end of a line that crosses a chunk boundary.
// Chosen by a fair dice roll.
const DefaultOverflowScanSize = 1024

var (
	// ErrNoLinebreakInChunk is returned when a chunk starting at a non-zero
	// offset contains no linebreak at all, so no line boundary can be
	// located inside it.
	ErrNoLinebreakInChunk = errors.New("no linebreak found in buff")
)
// Worker consumes Chunks from TasksChan, converts each chunk line by line
// through its LineProcessor, and sends one ChunkResult per chunk on
// resultChan.
//
// All buffs and handles are kept allocated for all iterations of Worker.Process.
type Worker struct {
	Id               int               // worker id, reported in LineMetadata
	TasksChan        chan Chunk        // inbound chunks; Work runs until it is closed
	resultChan       chan ChunkResult  // exactly one result is sent per chunk received
	waitGroup        *sync.WaitGroup   // Done is called when Work returns
	chunkSize        int64             // length of buff and outBuff
	lineProcessor    LineProcessor     // converts one raw line to output bytes
	handle           io.ReadSeekCloser // cached input handle, reused across chunks
	handleName       string            // id of the input the handle was opened for; NOTE(review): never assigned in this file — confirm the reuse check in prepareFileHandles can ever match
	chunk            *Chunk            // chunk currently being processed
	chunkResult      *ChunkResult      // result being built for the current chunk
	buff             []byte            // main read buffer (chunkSize bytes)
	overflowBuff     []byte            // tail of the chunk's last line, read past the chunk boundary
	outBuff          []byte            // converted output, flushed by writeOutBuff
	buffHead         int               // current read position within buff
	overflowBuffHead int               // number of valid bytes in overflowBuff
	outBuffHead      int               // number of valid bytes in outBuff
}
// NewWorker returns a new Worker with its read, output, and overflow
// buffers pre-allocated; the buffers are reused for every chunk the
// worker processes.
func NewWorker(
	id int,
	tasks chan Chunk,
	result chan ChunkResult,
	lineProcessor LineProcessor,
	chunkSize int64,
	overflowScanSize int,
	waitGroup *sync.WaitGroup,
) *Worker {
	w := &Worker{
		Id:            id,
		TasksChan:     tasks,
		resultChan:    result,
		waitGroup:     waitGroup,
		chunkSize:     chunkSize,
		lineProcessor: lineProcessor,
	}
	// Buffer heads start at their zero values; only the backing storage
	// needs explicit allocation.
	w.buff = make([]byte, chunkSize)
	w.outBuff = make([]byte, chunkSize)
	w.overflowBuff = make([]byte, overflowScanSize)
	return w
}
// Work processes chunks from Worker.TasksChan until the channel is closed,
// publishing one ChunkResult per chunk on the result channel and signalling
// the WaitGroup on exit.
func (w *Worker) Work() {
	defer w.waitGroup.Done()
	for task := range w.TasksChan {
		task := task // pin the iteration value before taking its address
		result := ChunkResult{Chunk: task}
		w.chunk = &task
		w.chunkResult = &result
		result.Err = w.Process()
		w.resultChan <- result
	}
}
// Process runs the full pipeline for the worker's current chunk: seek the
// handle, read the chunk, align it to line boundaries, convert each line,
// and flush the output. Buffers are reset when processing finishes,
// regardless of outcome.
func (w *Worker) Process() error {
	defer w.resetBuffers()
	// Each stage aborts the pipeline on failure, wrapping the error with
	// a description of the stage that produced it.
	stages := []struct {
		run  func() error
		what string
	}{
		{w.prepareFileHandles, "preparing file handles"},
		{w.readChunkInBuff, "reading Chunk in buff"},
		{w.prepareBuff, "preparing buff"},
		{w.processBuff, "processing buff"},
		{w.writeOutBuff, "writing output"},
	}
	for _, stage := range stages {
		if err := stage.run(); err != nil {
			return fmt.Errorf("error while %s: %w", stage.what, err)
		}
	}
	return nil
}
// prepareBuff aligns the read buffer to a line boundary and, for non-final
// chunks, pulls in the overflow bytes that complete the chunk's last line.
func (w *Worker) prepareBuff() error {
	// A chunk that does not start at the beginning of the input almost
	// certainly starts mid-line: skip to just past the first linebreak,
	// since the skipped prefix is handled as the previous chunk's overflow.
	if w.chunk.Offset != 0 {
		nl := bytes.IndexByte(w.buff, '\n')
		if nl < 0 {
			return ErrNoLinebreakInChunk
		}
		w.buffHead += nl + 1
		w.chunkResult.RealOffset = w.chunk.Offset + int64(nl)
	}
	// The final chunk has no continuation; everything else needs its last
	// line completed from beyond the chunk boundary.
	if w.chunkResult.EOF {
		return nil
	}
	if err := w.readOverflowInBuff(); err != nil {
		return err
	}
	w.chunkResult.RealSize += len(w.overflowBuff)
	return nil
}
// prepareFileHandles ensures w.handle is an open handle for the chunk's
// input and seeks it to the chunk's offset. The handle is cached across
// chunks and only reopened when the input changes.
func (w *Worker) prepareFileHandles() (err error) {
	if w.handle == nil || w.chunk.In.GetHandleID() != w.handleName {
		// The cached handle (if any) belongs to a different input; close it
		// before replacing it so the descriptor is not leaked.
		if w.handle != nil {
			if cerr := w.handle.Close(); cerr != nil {
				return cerr
			}
			w.handle = nil
		}
		w.handle, err = w.chunk.In.OpenHandle()
		if err != nil {
			return
		}
		// Record which input the handle belongs to. Without this the cache
		// check above could never match and every chunk reopened the file.
		w.handleName = w.chunk.In.GetHandleID()
	}
	_, err = w.handle.Seek(w.chunk.Offset, io.SeekStart)
	return
}
// resetBuffers restores every buffer to its full capacity and rewinds all
// buffer heads, readying the worker for its next chunk.
func (w *Worker) resetBuffers() {
	for _, buf := range []*[]byte{&w.buff, &w.overflowBuff, &w.outBuff} {
		*buf = (*buf)[:cap(*buf)]
	}
	w.buffHead, w.overflowBuffHead, w.outBuffHead = 0, 0, 0
}
// readChunkInBuff fills w.buff from the current handle. io.ReadFull is
// used because a plain Read may legally return fewer bytes than requested
// without reaching end of input, which would falsely truncate the chunk
// and flag it as final.
func (w *Worker) readChunkInBuff() (err error) {
	w.chunkResult.RealSize, err = io.ReadFull(w.handle, w.buff)
	if w.chunkResult.RealSize != w.chunk.Size {
		// Short chunk: trim the buffer to the bytes actually read and mark
		// that the end of the input was reached.
		w.buff = w.buff[:w.chunkResult.RealSize]
		w.chunkResult.EOF = true
	}
	if err == io.ErrUnexpectedEOF {
		// A truncated final chunk is expected; it is not an error.
		err = nil
	}
	return
}
// readOverflowInBuff reads past the chunk boundary until the linebreak
// terminating the chunk's last line is found, doubling the overflow buffer
// as needed. On return w.overflowBuff holds exactly the bytes up to (and
// excluding) that linebreak.
func (w *Worker) readOverflowInBuff() error {
	for {
		scanBuff := w.overflowBuff[w.overflowBuffHead:]
		n, err := w.handle.Read(scanBuff)
		if err != nil {
			return err
		}
		// Only inspect the n bytes actually read: the buffer is reused
		// across chunks and is never zeroed, so bytes past n are stale and
		// may contain a phantom linebreak from an earlier chunk. Read is
		// also allowed to fill less than the whole slice.
		if i := bytes.IndexByte(scanBuff[:n], '\n'); i != -1 {
			w.overflowBuffHead += i
			w.overflowBuff = w.overflowBuff[:w.overflowBuffHead]
			return nil
		}
		w.overflowBuffHead += n
		if w.overflowBuffHead == len(w.overflowBuff) {
			// No linebreak yet and the buffer is full: double it and keep
			// the bytes read so far.
			newBuff := make([]byte, 2*len(w.overflowBuff))
			copy(newBuff, w.overflowBuff)
			w.overflowBuff = newBuff
		}
	}
}
// processBuff walks w.buff line by line, converting each complete line
// via the LineProcessor and saving the results into Worker.outBuff; the
// chunk's final, unterminated line is completed from Worker.overflowBuff.
func (w *Worker) processBuff() error {
	for {
		next := bytes.IndexByte(w.buff[w.buffHead:], '\n')
		if next < 0 {
			// No terminator left in this chunk: merge the remainder with
			// the overflow bytes and process it as the last line.
			if err := w.processOverflowLine(); err != nil {
				return fmt.Errorf("error while processing last Line of Chunk: %w", err)
			}
			return nil
		}
		if err := w.processLine(next + 1); err != nil {
			return fmt.Errorf("error while processing Line of Chunk: %w", err)
		}
		if w.buffHead == w.chunkResult.RealSize {
			return nil
		}
	}
}
// processLine converts the next lineLen bytes of w.buff (one line,
// including its trailing linebreak) and appends the result to the output
// buffer, advancing the buffer head and the per-chunk line counter.
func (w *Worker) processLine(lineLen int) error {
	meta := LineMetadata{
		WorkerId: w.Id,
		Line:     w.chunkResult.Lines + 1,
		Chunk:    w.chunk,
	}
	raw := w.buff[w.buffHead : w.buffHead+lineLen]
	converted, err := w.lineProcessor.Process(raw, meta)
	if err != nil {
		return err
	}
	w.addToOutBuff(converted)
	w.buffHead += lineLen
	w.chunkResult.Lines++
	return nil
}
// processOverflowLine stitches the unterminated tail of w.buff together
// with the valid overflow bytes, converts the combined line, and appends
// the result to the output buffer.
func (w *Worker) processOverflowLine() error {
	tail := w.buff[w.buffHead:]
	// Only the first overflowBuffHead bytes of the overflow buffer are
	// valid; anything beyond is stale storage.
	line := make([]byte, 0, len(tail)+w.overflowBuffHead)
	line = append(line, tail...)
	line = append(line, w.overflowBuff[:w.overflowBuffHead]...)
	converted, err := w.lineProcessor.Process(line, LineMetadata{
		WorkerId: w.Id,
		Line:     w.chunkResult.Lines + 1,
		Chunk:    w.chunk,
	})
	if err != nil {
		return err
	}
	w.addToOutBuff(converted)
	w.chunkResult.Lines++
	return nil
}
// addToOutBuff appends b at the output buffer's head, growing the buffer
// only when the remaining room is insufficient.
func (w *Worker) addToOutBuff(b []byte) {
	if len(b) == 0 {
		return
	}
	if room := len(w.outBuff) - w.outBuffHead; room >= len(b) {
		copy(w.outBuff[w.outBuffHead:], b)
	} else {
		// Let append pick the new capacity, then re-extend to full length
		// so the fast copy path above keeps working.
		w.outBuff = append(w.outBuff[:w.outBuffHead], b...)
		w.outBuff = w.outBuff[:cap(w.outBuff)]
	}
	w.outBuffHead += len(b)
}
// writeOutBuff flushes the converted output to the chunk's destination.
// Nothing is written when there is no output or no destination.
func (w *Worker) writeOutBuff() error {
	if w.outBuffHead == 0 || w.chunk.Out == nil {
		return nil
	}
	return w.chunk.Out.Write(w.chunk, w.outBuff[:w.outBuffHead])
}