# The directory that contains all *.story files: ~/data/cnn_dm/cnn/
# How to run to extract (article, title) pairs:
# python cnn_dm_downloader.py ~/data/cnn_dm/cnn/ ~/data/cnn_dm/cnn/processed/ article
# How to run to extract the highlights:
# python cnn_dm_downloader.py ~/data/cnn_dm/cnn/ ~/data/cnn_dm/cnn/processed/ highlight
# In 'article' mode this creates ~/data/cnn_dm/cnn/processed/articles and ~/data/cnn_dm/cnn/processed/title;
# in 'highlight' mode the highlight files are written directly into the output directory.
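# Expected input layout (a sketch inferred from the glob patterns below; <id> is illustrative):
#   ~/data/cnn_dm/cnn/<id>.html    # saved article page, read in 'article' mode
#   ~/data/cnn_dm/cnn/<id>.story   # CNN/DM story file, read in 'highlight' mode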
from glob import glob
from unidecode import unidecode
from multiprocessing import Pool, cpu_count
import os, sys
from newspaper import Article  # requires Python 3 (newspaper3k)

try:
    # Python 2 only: force UTF-8 as the default encoding; on Python 3, reload() is not a builtin.
    reload(sys)
    sys.setdefaultencoding('utf-8')
except NameError:
    pass

from chardet.universaldetector import UniversalDetector

def encoding_detector(filename):
    """Guess the character encoding of a file with chardet's UniversalDetector."""
    detector = UniversalDetector()
    for line in open(filename, 'rb'):
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    return detector.result['encoding']

def remove_non_ascii(text):
    """Transliterate non-ASCII characters to their closest ASCII equivalents."""
    try:
        return unidecode(unicode(text, encoding="utf-8"))  # Python 2 byte strings
    except (NameError, TypeError):
        return unidecode(text)  # Python 3 strings (or text that is already unicode)
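# For example, remove_non_ascii('café') returns 'cafe': accented characters are transliterated to plain ASCII.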

def run(param):
    """Parse one saved HTML page with newspaper and write its body text and title to disk."""
    (article_dir, title_dir, html_path) = param
    try:
        raw_html = open(html_path, encoding="ascii", errors="surrogateescape").read().strip()
    except UnicodeDecodeError:
        # Fall back to a detected encoding if the file cannot be read as ASCII.
        raw_html = open(html_path, encoding=encoding_detector(html_path), errors="surrogateescape").read().strip()
    id = html_path.split('/')[-1].split('.')[0]
    a = Article('http://www.dummy.com', language='en')  # placeholder URL; the HTML is passed in directly
    a.download(input_html=raw_html)
    a.parse()
    title = remove_non_ascii(a.title)
    text = remove_non_ascii(a.text)
    with open('{}/{}'.format(article_dir, id), 'w', encoding='utf-8') as fw:
        fw.write(text)
    with open('{}/{}'.format(title_dir, id), 'w', encoding='utf-8') as fw:
        fw.write(title)

def extract_highlight(param):
    """Collect the @highlight sentences from one *.story file."""
    (indir, outdir, id) = param
    lines = open('{}/{}.story'.format(indir, id)).readlines()
    highlights = []
    for i, line in enumerate(lines):
        if "@highlight" in line.strip():
            try:
                # The highlight text sits two lines below the "@highlight" marker
                # (the marker line is followed by a blank line).
                highlights.append('{}.'.format(lines[i + 2].strip()))
            except IndexError:
                continue
    with open('{}/{}'.format(outdir, id), 'w') as fw:
        fw.write('\n'.join(highlights))
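
# Sketch of the *.story layout the i+2 indexing above assumes (the text itself is illustrative):
#
#   (CNN) -- First paragraph of the article ...
#
#   @highlight
#
#   First highlight sentence
#
#   @highlight
#
#   Second highlight sentence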

indir = sys.argv[1]
outdir = sys.argv[2]
mode = sys.argv[3]  # 'article' or 'highlight'

if mode == 'article':
    article_dir = '{}/articles'.format(outdir)
    title_dir = '{}/title'.format(outdir)
    if not os.path.exists(article_dir):
        os.makedirs(article_dir)
    if not os.path.exists(title_dir):
        os.makedirs(title_dir)
    params = [(article_dir, title_dir, k) for k in glob('{}/*.html'.format(indir))]
    print('processing {} files...'.format(len(params)))
    pool = Pool(cpu_count())
    pool.map(run, params, 1000)  # chunksize of 1000
    pool.close()
else:
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    params = [(indir, outdir, k.split('/')[-1].split('.')[0]) for k in glob('{}/*.story'.format(indir))]
    print('processing {} files...'.format(len(params)))
    pool = Pool(cpu_count())
    pool.map(extract_highlight, params, 1000)  # chunksize of 1000
    pool.close()
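
# Example output layout after running both commands above (a sketch; <id> stands for a story/page id):
#   ~/data/cnn_dm/cnn/processed/articles/<id>   # article body text
#   ~/data/cnn_dm/cnn/processed/title/<id>      # article title
#   ~/data/cnn_dm/cnn/processed/<id>            # extracted highlights, one per line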