-
Notifications
You must be signed in to change notification settings - Fork 1
/
lnparse.py
executable file
·48 lines (43 loc) · 1.27 KB
/
lnparse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
# Parse a plain-text article export in which every article is introduced by
# a "<n> of <total> DOCUMENTS" line. The results are left in the module-level
# dictionaries below, each keyed by the article number <n>.

bestandsnaam = "De_Telegraaf2014-03-22_22-08.TXT"  # input file to parse
artikel = 0  # number of the article being read; stays 0 until a header is seen
tekst = {}  # article number -> concatenated lines not recognised as metadata
datum = {}  # declared for callers, but not filled anywhere in this script
section = {}  # article number -> value of the "SECTION: " line
length = {}  # article number -> value of the "LENGTH: " line
loaddate = {}  # article number -> value of the "LOAD-DATE: " line
language = {}  # article number -> value of the "LANGUAGE: " line
pubtype = {}  # article number -> value of the "PUBLICATION-TYPE: " line
journal = {}  # article number -> value of the "JOURNAL-CODE: " line

# Compiled once here instead of being looked up again for every input line.
_document_header = re.compile(r"\s+(\d+) of (\d+) DOCUMENTS")

# Metadata labels, in the order they are tested, paired with the dictionary
# that receives the value found on such a line.
_metadata_fields = (
    ("SECTION", section),
    ("LENGTH", length),
    ("LOAD-DATE", loaddate),
    ("LANGUAGE", language),
    ("PUBLICATION-TYPE", pubtype),
    ("JOURNAL-CODE", journal),
)

# Lines starting with one of these (after leading whitespace) are discarded.
_skipped_prefixes = ("Copyright ", "All Rights Reserved")

with open(bestandsnaam, "r") as f:
    for line in f:
        # Drop carriage returns so that only "\n" terminates a line.
        line = line.replace("\r", "")
        if line == "\n":
            # Completely empty lines are not kept in the article text.
            continue

        matchObj = _document_header.match(line)
        if matchObj:
            # A header starts a new article: switch to its number and begin
            # with an empty text for it.
            artikel = int(matchObj.group(1))
            tekst[artikel] = ""
            continue

        for _label, _target in _metadata_fields:
            if line.startswith(_label):
                _prefix = _label + ": "
                _value = line
                # Remove the label only where it leads the line; a plain
                # str.replace would also delete any later occurrence of the
                # same text inside the value itself.
                if _value.startswith(_prefix):
                    _value = _value[len(_prefix):]
                _target[artikel] = _value.rstrip("\n")
                break
        else:
            # Not a metadata line: either boilerplate to skip or article text.
            if line.lstrip().startswith(_skipped_prefixes):
                continue
            # Text that precedes the first header has no entry yet; start it
            # from "" under the current key (0) instead of raising KeyError,
            # matching how metadata before the first header is stored.
            tekst[artikel] = tekst.get(artikel, "") + line