-
Notifications
You must be signed in to change notification settings - Fork 0
/
normalize.js
434 lines (382 loc) · 15.9 KB
/
normalize.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
/*------------------------------------*\
Imports
\*------------------------------------*/
const path = require('path');
const fs = require('fs-extra');
const _ = require('lodash');
/*------------------------------------*\
Sources
\*------------------------------------*/
const SOURCES_PATH = path.join(__dirname, './sources');
const SOURCES_ISO3166_PATH = path.join(SOURCES_PATH, './iana/tzdb-2020a/iso3166.tab');
const SOURCES_ZONE1970_PATH = path.join(SOURCES_PATH, './iana/tzdb-2020a/zone1970.tab');
const SOURCES_TO2050_PATH = path.join(SOURCES_PATH, './iana/tzdb-2020a/to2050.tzs');
const SOURCES_TZDATA_PATH = path.join(SOURCES_PATH, './iana/tzdb-2020a/tzdata.zi');
const SOURCES_NORMALIZED_PATH = path.join(SOURCES_PATH, './iana/tzdb-2020a-normalized');
const SOURCES_NORMALIZED_RULES_PATH = path.join(SOURCES_NORMALIZED_PATH, './rules.json');
const SOURCES_NORMALIZED_LINKS_PATH = path.join(SOURCES_NORMALIZED_PATH, './links.json');
const SOURCES_NORMALIZED_ZONES_PATH = path.join(SOURCES_NORMALIZED_PATH, './zones.json');
const SOURCES_NORMALIZED_COUNTRIES_PATH = path.join(SOURCES_NORMALIZED_PATH, './countries.json');
const SOURCES_NORMALIZED_ONGOING_PATH = path.join(SOURCES_NORMALIZED_PATH, './ongoing.json');
/*------------------------------------*\
Normalize Links
\*------------------------------------*/
/**
 * Extracts the link lines from tzdata.zi and writes them to links.json,
 * sorted by target and then by source.
 *
 * A line is treated as a link when its first non-blank character is "L";
 * its second whitespace-separated field becomes `target` and its third
 * becomes `source` (missing fields are stored as null).
 *
 * @returns {Promise<boolean>} resolves to true once the file is written
 */
async function normalizeLinks() {
  const content = await fs.readFile(SOURCES_TZDATA_PATH, 'utf8');

  const output = content
    .split(/\r?\n/)
    // keeping only "L…" lines also drops comments ("#…") and blank lines
    .filter(line => line.trim().startsWith('L'))
    .map(line => {
      const components = line.split(/\s/);
      return {
        target: components[1] || null,
        source: components[2] || null,
      };
    });

  // three-way comparison by UTF-16 code units; null is ordered as an empty string
  const compareText = (a, b) => {
    const left = a || '';
    const right = b || '';
    if (left < right) return -1;
    if (left > right) return 1;
    return 0;
  };

  // several links share one target, so the comparator must report ties (0) and
  // break them on "source"; a comparator that never returns 0 is inconsistent
  // and leaves the order inside each target group up to the engine
  const outputSorted = output.slice().sort((a, b) => (
    compareText(a.target, b.target) || compareText(a.source, b.source)
  ));

  await fs.writeJson(SOURCES_NORMALIZED_LINKS_PATH, outputSorted, {
    spaces: 2,
    EOL: '\n',
    encoding: 'utf8',
  });
  return true;
}
/*------------------------------------*\
Normalize Rules
\*------------------------------------*/
/**
 * Extracts the rule lines from tzdata.zi and writes them, in file order,
 * to rules.json.
 *
 * A line is treated as a rule when its first non-blank character is "R".
 * Whitespace-separated fields 1..9 are stored under the keys listed in
 * RULE_FIELDS; missing or empty fields are stored as null.
 *
 * @returns {Promise<boolean>} resolves to true once the file is written
 */
async function normalizeRules() {
  // output keys, in the order of the fields that follow the leading "R"
  const RULE_FIELDS = ['name', 'from', 'to', 'type', 'in', 'on', 'at', 'save', 'letters'];

  const raw = await fs.readFile(SOURCES_TZDATA_PATH, 'utf8');

  const parsedRules = raw
    .split(/\r?\n/)
    // comment lines and blank lines never start with "R", so this one test covers them too
    .filter(text => text.trim().startsWith('R'))
    .map(text => {
      const parts = text.split(/\s/);
      const entry = {};
      RULE_FIELDS.forEach((key, position) => {
        entry[key] = parts[position + 1] || null;
      });
      return entry;
    });

  await fs.writeJson(SOURCES_NORMALIZED_RULES_PATH, parsedRules, {
    spaces: 2,
    EOL: '\n',
    encoding: 'utf8',
  });
  return true;
}
/*------------------------------------*\
Normalize Zones
\*------------------------------------*/
/**
 * Extracts the zone lines from tzdata.zi and writes them, in file order,
 * to zones.json.
 *
 * Every line that is not blank and does not start with "#", "R" or "L" is
 * treated as zone data. A line whose first field is "Z" opens a new zone;
 * any other line is a continuation of the most recently opened zone and is
 * stored under that zone's name.
 *
 * @returns {Promise<boolean>} resolves to true once the file is written
 */
async function normalizeZones() {
  // output keys, in the order of the fields that follow the leading "Z"
  const ZONE_FIELDS = [
    'name', 'stdoff', 'rules', 'format',
    'until', 'until_in', 'until_on', 'until_at',
  ];

  const raw = await fs.readFile(SOURCES_TZDATA_PATH, 'utf8');
  const parsedZones = [];
  let activeZoneName;

  raw.split(/\r?\n/).forEach(text => {
    const trimmed = text.trim();
    // skip blanks, comments, rule lines and link lines
    if (trimmed === '' || /^[#RL]/.test(trimmed)) return;

    const parts = text.split(/\s/);
    let fields;
    if (parts[0] === 'Z') {
      activeZoneName = parts[1];
      fields = parts;
    } else {
      // continuation line: give it the same shape as a full "Z NAME …" line
      fields = ['Z', activeZoneName, ...parts];
    }

    const entry = {};
    ZONE_FIELDS.forEach((key, position) => {
      entry[key] = fields[position + 1] || null;
    });
    parsedZones.push(entry);
  });

  await fs.writeJson(SOURCES_NORMALIZED_ZONES_PATH, parsedZones, {
    spaces: 2,
    EOL: '\n',
    encoding: 'utf8',
  });
  return true;
}
/*------------------------------------*\
Normalize Countries
\*------------------------------------*/
/**
 * Joins zone1970.tab with iso3166.tab and writes the result to
 * countries.json.
 *
 * Each data line of zone1970.tab (tab-separated: country codes, coordinates,
 * TZ name, optional comments) becomes one entry whose `countries` array holds
 * a {code, name} pair for every comma-separated country code, with the name
 * looked up in iso3166.tab.
 *
 * @returns {Promise<boolean>} resolves to true once the file is written
 * @throws {Error} when zone1970.tab references a country code that
 *   iso3166.tab does not define
 */
async function normalizeCountries() {
  // the two source files are independent, so read them in parallel
  const [contentZone, contentIso] = await Promise.all([
    fs.readFile(SOURCES_ZONE1970_PATH, 'utf8'),
    fs.readFile(SOURCES_ISO3166_PATH, 'utf8'),
  ]);

  // data lines are everything except blank lines and "#" comments
  const toDataLines = content => content.split(/\r?\n/).filter(line => {
    const trimmed = line.trim();
    return trimmed !== '' && !trimmed.startsWith('#');
  });

  // code -> country name; the first occurrence of a code wins
  const namesByCode = new Map();
  for (const line of toDataLines(contentIso)) {
    const [code, name] = line.split(/\t+/);
    if (!namesByCode.has(code)) namesByCode.set(code, name);
  }

  const output = [];
  for (const line of toDataLines(contentZone)) {
    const components = line.split(/\t+/);
    const countries = components[0].split(',').map(code => {
      // fail with context instead of a bare "cannot read property of undefined"
      if (!namesByCode.has(code)) throw new Error(
        `Unknown country code "${code}" (not present in iso3166.tab) in zone1970.tab line: ${line}`
      );
      return {code, name: namesByCode.get(code)};
    });
    output.push({
      tz: components[2],
      countries: countries,
      coordinates: components[1] || null,
      comments: components[3] || null,
    });
  }

  await fs.writeJson(SOURCES_NORMALIZED_COUNTRIES_PATH, output, {
    spaces: 2,
    EOL: '\n',
    encoding: 'utf8',
  });
  return true;
}
/*------------------------------------*\
Normalize Ongoing
\*------------------------------------*/
/*
cliff notes taken from tz-how-to.html, theory.html and zic.8.txt
this is also useful https://www.ibm.com/support/knowledgecenter/ssw_aix_72/z_commands/zic.html
(although it seems to be from older version of IANA distribution)
zone format can have three forms
- applicable to unnamed zone rule ("-"):
- single string of three or more alphanumeric characters, beginning with either "+" or "-"
- applicable to named zone rules:
- pair of strings separated by "/", first string is for ST, second string is for DST
- single string containing "%s" which is replaced by LETTER/S value from appropriate rule
rules tells us whether DST is observed
- if rule is "-", then ST always applies
- if rule is amount of time, only the sum of ST and that amount matters
- if rule is named rule:
- if at least one transition happened, use the most recent
- if before any transition, assume ST, use earliest with SAVE=0 (applicable only to historical zones?)
rules have "to" (year), "in" (month), "on" (day), "at" (hour)
- note that values of those are even more abbreviated in tzdata.zi
- note that "on" can result in a day in neighboring month, possibly different from "in"
rule "to" possible values:
number (year)
"only" same year as "from"
"max" ongoing rule
rule "in" possible value:
string (month name)
rule "on" possible values:
"5" the fifth of the month
"lastSun" the last Sunday in the month
"lastMon" the last Monday in the month
"Sun>=8" first Sunday on or after the eighth
"Sun<=25" last Sunday on or before the 25th
rule "at" possible values:
"2" time in hours
"2:00" time in hours and minutes
"01:28:14" time in hours, minutes, and seconds
"00:19:32.13" time with fractional seconds
"12:00" midday, 12 hours after 00:00
"15:00" 3 PM, 15 hours after 00:00
"24:00" end of day, 24 hours after 00:00
"260:00" 260 hours after 00:00
"-2:30" 2.5 hours before 00:00
"-" equivalent to 0
*/
/**
 * Builds the list of zones that are currently in effect (zone entries without
 * an UNTIL value), attaches to each of them the rule objects that currently
 * apply, and writes the result to ongoing.json.
 *
 * Reads zones.json, rules.json and links.json from the normalized directory,
 * so those files must already exist when this runs.
 *
 * For every retained zone, `zone.rules` (originally the rule name) is replaced
 * in place by an array of augmented rule objects:
 * - an empty array when the zone's rule is "-",
 * - the last and the next transition when a dated transition after the
 *   reference month exists,
 * - otherwise the TO=max rules when there are any,
 * - otherwise the single rule that sorts last.
 *
 * @returns {Promise<boolean>} resolves to true once the file is written
 * @throws {Error} with a "TODO: …" message when the data contains a case this
 *   implementation does not account for
 */
async function normalizeOngoing() {
const zones = await fs.readJson(SOURCES_NORMALIZED_ZONES_PATH, 'utf8');
const rules = await fs.readJson(SOURCES_NORMALIZED_RULES_PATH, 'utf8');
// NOTE(review): "links" is loaded but never referenced in the rest of this function
const links = await fs.readJson(SOURCES_NORMALIZED_LINKS_PATH, 'utf8');
// check if zone RULES is numerical value
// (check only in current zones without UNTIL)
// this doesn't happen in IANA 2020a distribution, but if any other distribution is used then
// throw error because this edge case might not be accounted for
for (const zone of zones.filter(z => !z.until)) {
if (/\d/gi.test(zone.rules)) throw new Error(
`TODO: account for zone rules that contain numerical or time value (${zone.name})`
);
}
// check if zone RULES is unnamed and zone FORMAT is variable (with "/" or "%s")
// (check only in current zones without UNTIL)
// this doesn't happen in IANA 2020a distribution, but if any other distribution is used then
// throw error because this edge case might not be accounted for
for (const zone of zones.filter(z => !z.until)) {
if (zone.rules === '-' && (zone.format.includes('%s') || zone.format.includes('/'))) throw new Error(
`TODO: account for zone unnamed rules where zone also has variable format (${zone.name})`
);
}
// check that every rule "from" field is a 4 digit year (any other value is rejected)
// this doesn't happen in IANA 2020a distribution, but if any other distribution is used then
// throw error because this edge case might not be accounted for
for (const rule of rules) {
if (!/^\d{4}$/.test(rule.from)) throw new Error(
`TODO: account for zone rule "from" value other than 4 digit year value`
);
}
// check that every rule "to" field is "ma", "o" or a 4 digit year (any other value is rejected)
// this doesn't happen in IANA 2020a distribution, but if any other distribution is used then
// throw error because this edge case might not be accounted for
for (const rule of rules) {
if (!/^ma$|^o$|^\d{4}$/.test(rule.to)) throw new Error(
`TODO: account for zone rule "to" value other than "ma", "o" or 4 digit year value`
);
}
const output = [];
for (const zone of zones) {
// skip if zone is historical
if (zone.until) continue;
// get expanded zone rules
const zoneRules = rules.filter(rule => rule.name === zone.rules);
// augment rules with some values that will make some of next steps easier
const rulesAugmented = zoneRules.map(rule => augmentRule(rule));
// get zone rules sorted by "to" value
// ("to_combined" is a string, so this is a string comparison)
const rulesSortedByTo = rulesAugmented.slice().sort((a, b) => (a.to_combined > b.to_combined) ? 1 : -1);
// because "to_combined" doesn't account for days and hours,
// there's a chance for duplicates of that value if transition happened in the same year and month,
// which would make it later impossible to check which rule is the most recent
// this doesn't happen in IANA 2020a distribution, but if any other distribution is used then
// throw error because this edge case might not be accounted for
// get all rules to_combined to a single sorted array
// and compare the last two rules and see if they are equal
// if they are equal it means we don't know which one is actually most recent
const rulesCombineds = rulesSortedByTo.filter(r => r.to !== 'ma').map(r => r.to_combined).sort();
const rulesCombinedsLastTwo = rulesCombineds.slice(-2);
if (rulesCombinedsLastTwo.length && (rulesCombinedsLastTwo[0] === rulesCombinedsLastTwo[1])) {
throw new Error(
`TODO: normalize rule "to_combined" to a full date, also accounting for day ("on") and hour ("at") data` +
` (${zone.name}) (rule.to_combined=${rulesCombinedsLastTwo[0]})`
)
}
// get zone rules with TO=max
const rulesToMax = rulesAugmented.filter(rule => rule.to === 'ma');
// - if there is zero rules with TO=max, it means we later just need to find most recent transition
// - if there is one rule with TO=max, it means one other rule will be most recent transition (I think?)
// - if there is more than two rules with TO=max, I don't know what that means, is it even possible?
// this doesn't happen in IANA 2020a distribution, but if any other distribution is used then
// throw error because this edge case might not be accounted for
if (rulesToMax.length === 1 || rulesToMax.length > 2) throw new Error(
`TODO: account for amount of TO=max rules other than zero or two (${zone.name})`
);
// extract zone that has unnamed rules, zone observes only ST
// (the zone object itself is mutated: its "rules" name is replaced by an array)
if (zone.rules === '-') {
zone.rules = [];
output.push(zone);
continue;
}
// extract zone with future transitions, zone observes both ST and DST (similar to TO=max)
// get current year
// specific date is used instead Date.now(), so the same output from this repository is reproducible
// (if you're using this in future, you will probably need to update this repository anyway)
// NOTE(review): getFullYear()/getMonth() read the date in local time, not UTC
const now = new Date('2020-07-25T00:00:00Z');
const nowTo = now.getFullYear();
const nowIn = String(now.getMonth()+1).padStart(2, '0');
const nowCombined = `${nowTo}-${nowIn}`;
// find index of the last transition that happened (so, current transition)
// so find the next future transition and subtract 1
// (findIndex returns -1 when nothing matches, which makes the result -2 here)
// NOTE(review): the index is computed on the array without TO=max rules but is used below
// to index "rulesSortedByTo"; the two line up only while "ma-…" values sort after every
// numeric year, which holds for the string comparison used above
const ruleLastTransitionIndex = rulesSortedByTo.filter(r => r.to !== 'ma').findIndex(rule => {
return rule.to_combined > nowCombined;
}) - 1;
// not sure if possible, but throw error if the index here is 0
// (index 0 would leave no previous transition for the lookup below)
// this doesn't happen in IANA 2020a distribution, but if any other distribution is used then
// throw error because this edge case might not be accounted for
if (ruleLastTransitionIndex === 0) throw new Error(
`TODO: account for ruleLastTransitionIndex being 0 (${zone.name})`
);
// finally, extract
if (ruleLastTransitionIndex > -1) {
const rulePrevTransition = rulesSortedByTo[ruleLastTransitionIndex-1];
const ruleLastTransition = rulesSortedByTo[ruleLastTransitionIndex];
const ruleNextTransition = rulesSortedByTo[ruleLastTransitionIndex+1];
// not very DRY, but,
// like before, check if we can actually see what transition
// was most recent with current implementation of "to_combined"
if (
(rulePrevTransition.to_combined === ruleLastTransition.to_combined) ||
(ruleLastTransition.to_combined === ruleNextTransition.to_combined)
) {
throw new Error(
`TODO: normalize rule "to_combined" to a full date, also accounting for day ("on") and hour ("at") data` +
` (${zone.name})`
)
}
zone.rules = [
ruleLastTransition,
ruleNextTransition,
];
output.push(zone);
continue;
}
// extract zone with TO=max, zone observes both ST and DST
// this must go after checking for future transitions, because e.g. in a 2020a distribution
// "Asia/Tehran" with "Iran" rule includes both future transitions and TO=max transitions
if (rulesToMax.length) {
zone.rules = rulesToMax;
output.push(zone);
continue;
}
// extract zone that uses a single (most recent) transition
// (slice(-1) keeps the result an array, holding the rule that sorts last)
zone.rules = rulesSortedByTo.slice(-1);
output.push(zone);
}
await fs.writeJson(SOURCES_NORMALIZED_ONGOING_PATH, output, {
spaces: 2,
EOL: '\n',
encoding: 'utf8',
});
return true;
}
/*------------------------------------*\
Augment Rule
\*------------------------------------*/
/**
 * Returns a copy of a normalized rule with two derived fields added:
 * - `to_distinct`: the rule's "to" value, or its "from" value when "to" is "o" (only)
 * - `to_combined`: `${to_distinct}-${MM}`, where MM is the two digit month
 *   number resolved from the rule's "in" field
 *
 * The month is accepted as any unambiguous, case-insensitive prefix of the
 * English month name, which is how zic reads it and how tzdata.zi writes it
 * ("Ja", "F", "Mar", "Ap", "May", "Jun", "Jul", "Au", "S", "O", "N", "D");
 * longer forms such as "Jan" or "January" resolve to the same month.
 * A missing, unknown or ambiguous month resolves to "00".
 *
 * The input object is not modified. Day ("on") and time ("at") are not taken
 * into account, and "to_combined" is produced even when "to" is "ma".
 *
 * @param {object} rule - normalized rule; the fields read are "from", "to" and "in"
 * @returns {object} shallow copy of `rule` extended with "to_combined" and "to_distinct"
 */
function augmentRule(rule) {
  const monthNames = [
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
  ];

  // year: "o" (only) means the same year as "from"
  const ruleTo = (rule.to === 'o') ? rule.from : rule.to;

  // month: collect every month the given text is a prefix of, accept it only if unique
  const wanted = (typeof rule.in === 'string') ? rule.in.toLowerCase() : '';
  const candidates = [];
  if (wanted) {
    monthNames.forEach((monthName, index) => {
      if (monthName.startsWith(wanted)) candidates.push(index + 1);
    });
  }
  const monthNumber = (candidates.length === 1) ? candidates[0] : 0;
  const ruleIn = String(monthNumber).padStart(2, '0');

  return {
    ...rule,
    to_combined: `${ruleTo}-${ruleIn}`,
    to_distinct: ruleTo, // same year as "from" if "to" was "o"
  };
}
/*------------------------------------*\
Init
\*------------------------------------*/
/**
 * Runs every normalization step in sequence.
 *
 * The steps are awaited one after another on purpose: normalizeOngoing reads
 * the links, rules and zones files that the earlier steps write.
 *
 * @returns {Promise<void>} rejects with the first error raised by any step
 */
async function normalize() {
  await normalizeLinks();
  await normalizeRules();
  await normalizeZones();
  await normalizeCountries();
  await normalizeOngoing();
}

// do not leave the promise floating: report the failure and make the process
// exit with a non-zero code so callers of the script can detect it
normalize().catch(error => {
  console.error(error);
  process.exitCode = 1;
});