-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathunified_tsv.py
36 lines (29 loc) · 1.09 KB
/
unified_tsv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from pathlib import Path
import shutil
def _remap_id(tag, shard_index, num_shards):
    """Return *tag* (``<prefix>-<n>``) renumbered to a global index, or None.

    The shard-local index ``n`` becomes ``n * num_shards + shard_index``.
    ``None`` is returned when *tag* does not contain exactly one hyphen or
    when the part after the hyphen is not an integer, i.e. when the line is
    not a record that should be copied to the merged file.
    """
    if tag.count("-") != 1:
        return None
    prefix, _, local_index = tag.partition("-")
    try:
        global_index = int(local_index) * num_shards + shard_index
    except ValueError:
        # Free-text lines that merely contain one hyphen are not records.
        return None
    return f"{prefix}-{global_index}"


def main(args):
    """Merge per-shard generation outputs into one file, then delete the shards.

    For every shard ``i`` in ``range(args.num_shards)`` the file
    ``<result_path>/<subset>_shard_<i>/generate-<subset>.txt`` is read line by
    line.  Lines whose first tab-separated field has the form ``<prefix>-<n>``
    are written to ``<result_path>/<subset>.txt`` with that field rewritten to
    ``<prefix>-<n * num_shards + i>``; all other lines are dropped.  Once the
    merged file has been written and closed, every shard directory is removed.

    Args:
        args: Object exposing ``result_path`` (a ``pathlib.Path``), ``subset``
            (str) and ``num_shards`` (int), e.g. an ``argparse.Namespace``.
    """
    num_shards = args.num_shards
    shard_dirs = [
        args.result_path / f"{args.subset}_shard_{i}" for i in range(num_shards)
    ]

    with open(args.result_path / f"{args.subset}.txt", "w") as fw:
        for i, shard_dir in enumerate(shard_dirs):
            print(f"shard {i}")
            with open(shard_dir / f"generate-{args.subset}.txt", "r") as fp:
                # Stream the shard instead of loading it fully with readlines().
                for line in fp:
                    # Only drop the line terminator: a bare strip() would also
                    # eat a trailing tab and thereby lose an empty last column.
                    fields = line.rstrip("\r\n").split("\t")
                    new_tag = _remap_id(fields[0], i, num_shards)
                    if new_tag is None:
                        continue
                    fields[0] = new_tag
                    fw.write("\t".join(fields) + "\n")

    # Delete the shard directories only after the merged file is closed, so a
    # failure while merging never destroys the source data.
    for shard_dir in shard_dirs:
        shutil.rmtree(Path(shard_dir))
if __name__ == "__main__":
    import argparse

    # Command-line entry point.  All three options are consumed
    # unconditionally by main(), so they are marked required: a missing one
    # is reported by argparse instead of surfacing as a TypeError later.
    parser = argparse.ArgumentParser(
        description=(
            "Merge the per-shard generate-<subset>.txt files into a single "
            "<subset>.txt and remove the shard directories."
        )
    )
    parser.add_argument(
        "--num-shards",
        type=int,
        required=True,
        help="number of <subset>_shard_<i> directories to merge",
    )
    parser.add_argument(
        "--result-path",
        type=Path,
        required=True,
        help="directory holding the shard directories; the merged file is written here",
    )
    parser.add_argument(
        "--subset",
        type=str,
        required=True,
        help="subset name used in the shard directory and file names",
    )
    args = parser.parse_args()
    main(args)