-
Notifications
You must be signed in to change notification settings - Fork 0
/
doc_extractor.py
45 lines (32 loc) · 1.42 KB
/
doc_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import argparse
from pathlib import Path
from validate_directory import ValidateDirectory
from article_writer import ArticleWriter
def main():
    """Command-line entry point.

    Parses the command line, builds an ``ArticleWriter`` from the
    ``--target`` and ``--extension`` options, and passes every file found
    under the ``--source`` directory to it.
    """
    cli_args = _create_arg_parser().parse_args()
    article_writer = ArticleWriter(
        output_dir=cli_args.target,
        file_extension=cli_args.extension,
    )
    source_root = Path(cli_args.source)
    _extract_articles_in_directory(source_root, article_writer)
def _create_arg_parser():
    """Build the command-line parser used by ``main``.

    Options:
        --source     required; processed by the ``ValidateDirectory`` action.
        --target     required; processed by the ``ValidateDirectory`` action.
        --extension  optional; defaults to ``"article"``.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    arg_parser = argparse.ArgumentParser(
        description="merge files into a single document",
    )

    # Both directory options share the same settings apart from flag and help.
    directory_options = (
        ("--source", "source directory, which contains the files to be merged"),
        ("--target", "output directory to write the documents to"),
    )
    for flag, help_text in directory_options:
        arg_parser.add_argument(
            flag,
            required=True,
            help=help_text,
            action=ValidateDirectory,
        )

    arg_parser.add_argument(
        "--extension",
        required=False,
        help="output file extension",
        default="article",
    )
    return arg_parser
def _extract_articles_in_directory(parent, article_writer):
for file in parent.iterdir():
if file.is_file():
_extract_articles_from_file(file, article_writer)
continue
if file.is_dir():
_extract_articles_in_directory(file, article_writer)
def _extract_articles_from_file(file, article_writer):
with file.open() as f:
line = f.readline()
while line:
article_writer.write_line(line)
line = f.readline()
# Run the extractor only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()