forked from jcjohnson/torch-rnn
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_data_to_train.py
executable file
·59 lines (44 loc) · 1.46 KB
/
get_data_to_train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/env python2.7
"""get_aws.py:
"""
__author__ = "Dilawar Singh"
__copyright__ = "Copyright 2017-, Dilawar Singh"
__version__ = "1.0.0"
__maintainer__ = "Dilawar Singh"
__email__ = "dilawars@ncbs.res.in"
__status__ = "Development"
import sys
sys.path.append( '..' )
import os
import html2other
import re
from db_connect import db_
# Module-level accumulator: main() appends one HTML snippet per database row
# here before the joined result is written to disk.
data_ = [ ]
# Matches exactly one <img ...> tag, self-closing or not, in any letter case.
# Quoted attribute values are consumed as a unit so that a '>' inside e.g. an
# alt text does not end the match early, and the match can never run past the
# tag's own closing '>' into following markup.
_IMG_TAG_RE = re.compile(
    r'''<img\b(?:[^>"']|"[^"]*"|'[^']*')*>''', re.IGNORECASE
)


def strip_image( txt ):
    """Return ``txt`` with every HTML ``<img>`` tag removed.

    Both ``<img ...>`` and ``<img ... />`` forms are stripped, regardless of
    letter case, attribute order or quoting style, and tags may span several
    lines.  All other markup and text is returned unchanged.

    :param txt: HTML fragment, or ``None`` (e.g. a NULL database column).
    :returns: The fragment without image tags; ``''`` when ``txt`` is ``None``.
    """
    if txt is None:
        # A NULL column has no images to strip; hand back an empty string so
        # callers can keep concatenating.
        return ''
    return _IMG_TAG_RE.sub( '', txt )
def main( ):
    """Dump talk and seminar texts from the database into text files.

    Steps, in order:

    1. Read ``title``/``description`` from table ``talks`` and
       ``title``/``abstract`` from table ``annual_work_seminars``; append one
       HTML snippet per row (image tags stripped) to the module-level
       ``data_`` list.
    2. Write the space-joined snippets to ``/tmp/_sample.html``.
    3. Convert that file with ``html2other.tomd`` and write the resulting
       text to ``/tmp/data.txt``.
    4. Write the distinct ``\\w+`` tokens of that text, one per line, to
       ``_words`` in the current working directory.

    NOTE: ``data_`` is never cleared, so calling this function twice in one
    process duplicates every entry.
    """
    cur = db_.cursor( dictionary = True )
    try:
        _append_rows( cur, 'SELECT title, description FROM talks'
                , 'description' )
        _append_rows( cur, 'SELECT title, abstract FROM annual_work_seminars'
                , 'abstract' )
    finally:
        # Release the cursor even when a query fails.
        cur.close( )

    with open( '/tmp/_sample.html', 'w' ) as f:
        f.write( ' '.join( data_ ) )

    # tomd returns a pair; only the first item (the converted text) is used.
    # presumably the second item is the path of the converted file -- verify
    # against html2other.
    aws, _unused = html2other.tomd( '/tmp/_sample.html' )

    # r'\\' is two literal backslashes, so only doubled backslashes are
    # removed here.
    # NOTE(review): if the intent was to drop single-backslash escapes from
    # the converter output, the pattern should be '\\' -- confirm.
    aws = aws.replace( r'\\', '' )

    words = set( re.findall( r'\w+', aws ) )
    with open( '_words', 'w' ) as f:
        f.write( '\n'.join( words ) )

    with open( '/tmp/data.txt', 'w' ) as f:
        f.write( aws )


def _append_rows( cur, query, body_key ):
    """Run ``query`` on ``cur`` and append one HTML snippet per row to data_.

    Each row must provide a ``title`` column and the column named by
    ``body_key``; image tags are removed from the latter.
    """
    cur.execute( query )
    for row in cur.fetchall( ):
        data_.append( '<br> %s </br>' % row[ 'title' ] + '<br>'
                + strip_image( row[ body_key ] ) )
# Script entry point: run the export only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()