-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.py
28 lines (22 loc) · 953 Bytes
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# -*- coding: utf-8 -*-
"""
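HVDC scraper: extract the dream reports and their character lists from the
saved page "hvdc_norms_male.htm" and write them to "hvdc_norms_male.csv".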
"""
import re
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
# Parse the saved HTML page. The context manager closes the file handle once
# BeautifulSoup has read the markup.
with open("hvdc_norms_male.htm") as html_file:
    soup = BeautifulSoup(html_file, "html.parser")

# Extract the page text once and reuse it for both searches.
page_text = soup.get_text()

# Report text: everything after a "#dddd " marker up to the trailing
# "(N words)" note. Raw strings keep the regex escapes (\d, \s, \() intact.
reports = re.findall(
    r'#\d\d\d\d (.+?)\s*?\([0-9]+\s*?words\)', page_text, re.DOTALL)

# Character block: the text after "Number: dddd ... OBJ" (optionally skipping
# five newlines) up to a run of four newlines followed by a newline or a tab.
characters = re.findall(
    r'Number: \d\d\d\d.*?OBJ.(?:\n{5})?(.*?)\n\n\n\n(?:\n|\t)',
    page_text, re.DOTALL)

# exclude newlines from reports
reports = [report.replace('\n', '') for report in reports]

# remove characters with a missing report
missing = [106, 107, 108, 109, 349, 351, 392, 393, 394]  # hvdc male dreams without report
# The numbers are 1-based positions; delete from the highest down so that
# earlier deletions do not shift the positions still to be removed.
for i in sorted(missing, reverse=True):
    del characters[i - 1]

# keep a list of lists of characters
characters = [character.split('\n') for character in characters]

# save to a dataframe
# zip() is a one-shot iterator on Python 3; materialise it so the DataFrame
# constructor receives a concrete list of (report, characters) rows.
df = pd.DataFrame(list(zip(reports, characters)), columns=['Report', 'Characters'])
# Pass the separator by keyword: positional arguments to to_csv other than the
# path are deprecated in pandas 2.x and keyword-only from pandas 3.0.
df.to_csv('hvdc_norms_male.csv', sep=',', index=False)