-
Notifications
You must be signed in to change notification settings - Fork 0
/
hw_1_data_set_check_multiple_topic.py
103 lines (86 loc) · 4.31 KB
/
hw_1_data_set_check_multiple_topic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from typing import TextIO
from bs4 import BeautifulSoup
import re
import math
import itertools
import copy
# Module-level accumulators filled by get_article_by_bs4().
# One entry per selected article; the two lists are index-aligned.
text_content: list = []    # the article's <text> tag as returned by BeautifulSoup
topicname_list: list = []  # the article's first topic name ('' when it has none)
def get_article_by_bs4() -> None:
    """
    Scan the Reuters SGML files and collect the topic-labelled articles.

    Reads ``reut2-000.sgm`` through ``reut2-021.sgm`` from the data directory,
    parses each file with BeautifulSoup and, for every ``<reuters>`` element
    whose ``topics`` attribute is ``"YES"``, selects it when either

    * ``lewissplit`` is ``"TRAIN"`` and the first topic name is non-empty, or
    * ``lewissplit`` is ``"TEST"`` (the topic name may be empty here).

    For each selected article the topic name and split are printed, the
    article's ``<text>`` tag is appended to the module-level ``text_content``
    list and the topic name to the module-level ``topicname_list``.

    :return: None; results are accumulated in the module-level lists
    :raises OSError: if one of the SGML files cannot be opened
    """
    # Directory holding the reut2-*.sgm files; "./" is the current directory.
    data_dir = "./"
    sgml_file_format = data_dir + "reut2-{}.sgm"

    for file_idx in range(0, 22):  # files are numbered 000 .. 021
        path = sgml_file_format.format(str(file_idx).zfill(3))
        # The context manager closes the file even if read() raises.
        # NOTE(review): the file is decoded with the platform default encoding;
        # confirm whether the data set needs an explicit encoding.
        with open(path, 'r') as sgml_file:
            content: str = sgml_file.read()

        # Parsing
        soup = BeautifulSoup(content, 'lxml')
        for article in soup.find_all('reuters'):
            topic: str = article['topics']
            lewissplit: str = article['lewissplit']
            text_tag = article.find('text')
            topics_tag = article.find('topics')

            # Only the first <d> entry inside <topics> is used as the label.
            try:
                topicname: str = topics_tag.find('d').get_text().strip()
            except AttributeError:  # no <topics> tag or no <d> entry in it
                topicname = ''

            # TRAIN articles need a topic name; TEST articles are kept even
            # when the topic name is empty.
            is_train = lewissplit == "TRAIN" and topicname != ""
            is_test = lewissplit == "TEST"
            if topic == "YES" and (is_train or is_test):
                print("TOPICS_NAME: ", topicname, "DATA TYPE: ", lewissplit)
                text_content.append(text_tag)
                topicname_list.append(topicname)
if __name__ == "__main__":
    # Script entry point: run the scan only when this file is executed
    # directly, not when it is imported as a module.
    get_article_by_bs4()
    # Disabled: export the collected text tags and topic names to a CSV file.
    # dict = {'title': text_content,
    # 'topicname': topicname_list}
    # csv_df = pd.DataFrame(dict)
    # # saving the dataframe
    # csv_df.to_csv("test_data.csv", encoding='utf-8', index=False)