# etl_transcribe_stats.py

#!/usr/bin/env python
#  -*- coding: utf-8 -*-

import os
import argparse
import datetime
import time
import json
import re
import logging
logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
from collections import defaultdict
from speech2text import log_kv, make_dir

"""
What it does:
Merges and formats the output generated by two other stats scripts,
and writes the results to a TSV file.
"""

# Names of the JSON files produced by the earlier stats scripts.
# The main script joins each of them onto the input folder.
IBM_LOG_STATS_FILEPATH = "processed.json"
IBM_WORD_COUNT_FILEPATH = "file_stats.json"
GOOGLE_LOG_STATS_FILEPATH = "gprocessed.json"
GOOGLE_WORD_COUNT_FILEPATH = "gfile_stats.json"

# Base folder names that prefix the keys of each API's stats files.
IBM_BASE = "ibm_stt"
GOOGLE_BASE = "google_stt"

# Name of the merged TSV written to the output folder.
TRANSCRIBED_STATS_TSV = "transcribed_stats.tsv"

# Per-API metadata used to canonicalize keys:
#   "base"        - folder prefix stripped from the front of a key
#   "transcripts" - transcript file names stripped from the end of a key
API_META = {
    "ibm": {
        "base": IBM_BASE,
        "transcripts": ["hypotheses.txt", "hypotheses.txt.dictated"],
    },
    "google": {
        "base": GOOGLE_BASE,
        "transcripts": ["transcript.txt", "transcript.txt.dictated"],
    },
}

def _load_stats_dict(path, api, label):
    """
    Loads a JSON file that is expected to hold a non-empty object.
    :param path: file to read; a missing file is treated as "no data"
    :param api: API name, used only in the log messages
    :param label: dataset label used in the log messages ("log stats" / "word counts")
    :return: the loaded dict, or {} when the file is missing or holds anything else
    """
    data = {}
    if os.path.isfile(path):
        with open(path) as fp:
            log_kv("Loading(%s %s)" % (api, label), path)
            data = json.load(fp)
    if data and isinstance(data, dict):
        log_kv("Count  (%s %s)" % (api, label), len(data))
        return data
    logging.error("Expected %s data." % label)
    # Anything that is not a dict (list, string, number, ...) cannot be merged
    # row by row, so it is reported above and then treated as empty.
    return data if isinstance(data, dict) else {}


def _strip_prefix(text, prefix):
    """Returns <text> without a leading <prefix>; later occurrences are left alone."""
    return text[len(prefix):] if text.startswith(prefix) else text


def _strip_suffix(text, suffix):
    """Returns <text> without a trailing <suffix>; an empty suffix is a no-op."""
    if suffix and text.endswith(suffix):
        return text[:-len(suffix)]
    return text


def etl_transcripts(log_stats_path, word_counts_path, api, ext=".out"):
    """
    Merges the two datasets using a canonicalized key.
    Every key loses a leading "<base>/" (base folder of <api> in API_META).
    Word-count keys additionally lose a trailing "/<transcript file name>"
    and then a trailing <ext>.
    If the two share a field then the value in <word_counts_path> will overwrite the value from <log_stats_path>
    A missing file, or one that does not hold a non-empty JSON object, is
    logged as an error and contributes no rows.
    :param log_stats_path: gives processing time
    :param word_counts_path: gives word counts
    :param api: "ibm" or "google"
    :param ext: extension stripped from basename
    :return: dict mapping canonical key to the merged row
    """
    base_prefix = API_META[api]["base"] + "/"
    transcript_suffixes = ["/" + name for name in API_META[api]["transcripts"]]

    result = {}
    for key, row in _load_stats_dict(log_stats_path, api, "log stats").items():
        result[_strip_prefix(key, base_prefix)] = row

    for key, row in _load_stats_dict(word_counts_path, api, "word counts").items():
        canonical = _strip_prefix(key, base_prefix)
        # Suffixes are applied one after the other, in the order listed in API_META.
        for suffix in transcript_suffixes:
            canonical = _strip_suffix(canonical, suffix)
        canonical = _strip_suffix(canonical, ext)
        if canonical in result:
            # Word counts win over log stats for fields present in both.
            result[canonical].update(row)
        else:
            result[canonical] = row

    return result


if __name__ == '__main__':
    # Script entry point: loads the IBM and Google stats from --infolder,
    # merges them per canonical key with "ibm_" / "google_" column prefixes,
    # and writes one TSV row per key into --outfolder.

    start_time = time.time()

    parser = argparse.ArgumentParser(description='Analyze transcribe rate')
    parser.add_argument('--infolder', '-i', action='store', default='.', help='folder containing previous ETL files')
    parser.add_argument('--outfolder', '-o', action='store', default='./output', help='output directory')

    args = parser.parse_args()

    log_kv("Running", __file__)
    log_kv("From", os.path.dirname(os.path.realpath(__file__)))
    # print("") emits the same blank line under Python 2 and Python 3.
    print("")

    inpath = os.path.realpath(args.infolder if args.infolder else os.getcwd())
    log_kv("inpath", inpath)
    outpath = os.path.realpath(args.outfolder if args.outfolder else u'./output')
    log_kv("outpath", outpath)
    make_dir(outpath)

    log_kv("")
    log_kv("IBM log stats", IBM_LOG_STATS_FILEPATH)
    log_kv("IBM transcript stats", IBM_WORD_COUNT_FILEPATH)
    log_kv("Google log stats", GOOGLE_LOG_STATS_FILEPATH)
    log_kv("Google transcript stats", GOOGLE_WORD_COUNT_FILEPATH)
    log_kv("")
    log_kv("IBM base", API_META["ibm"]["base"])
    log_kv("Google base", API_META["google"]["base"])
    log_kv("")
    log_kv("tsv filename", TRANSCRIBED_STATS_TSV)

    print("")
    log_kv("Loading IBM")
    ibm_data = etl_transcripts(os.path.join(inpath, IBM_LOG_STATS_FILEPATH),
                               os.path.join(inpath, IBM_WORD_COUNT_FILEPATH),
                               "ibm")
    log_kv("Count(IBM)", len(ibm_data))
    print("")
    log_kv("Loading Google")
    google_data = etl_transcripts(os.path.join(inpath, GOOGLE_LOG_STATS_FILEPATH),
                                  os.path.join(inpath, GOOGLE_WORD_COUNT_FILEPATH),
                                  "google")
    log_kv("Count(Google)", len(google_data))

    print("")
    # Union of the canonical keys seen for either API.
    keys = set(ibm_data) | set(google_data)
    log_kv("Number of keys in both", len(keys))

    # Flatten both datasets into one row per key; each column is prefixed
    # with the API it came from so identical field names do not collide.
    tsv_fields = set(["etl_time"])
    tsv_data = defaultdict(dict)
    for key in keys:
        merged = tsv_data[key]
        for column_prefix, dataset in (("ibm_", ibm_data), ("google_", google_data)):
            if key in dataset:
                for x, y in dataset[key].items():
                    merged[column_prefix + x] = y
                    tsv_fields.add(column_prefix + x)
        merged["etl_time"] = datetime.datetime.now().isoformat()

    tsv_keys = sorted(tsv_data.keys())
    tsv_fields = sorted(tsv_fields)
    print("Fields:\npath, " + ",".join(tsv_fields))

    #   Writes line data
    # Use the resolved output folder (the one make_dir created) so the file
    # lands there even when --outfolder was given as an empty string.
    tsv_path = os.path.join(outpath, TRANSCRIBED_STATS_TSV)
    log_kv("Writing", tsv_path)
    with open(tsv_path, 'w') as fp:
        fp.write("path\t" + "\t".join(tsv_fields) + "\n")
        for key in tsv_keys:
            row = tsv_data[key]
            cells = []
            for field in tsv_fields:
                value = row.get(field)
                if value is None or (not value and not isinstance(value, (int, float))):
                    # Missing or empty (None, "", [], {}) -> blank cell.
                    # Numeric zeros (and False) are real data and fall through.
                    cells.append("")
                elif isinstance(value, str):
                    cells.append('"%s"' % value)
                else:
                    cells.append("%s" % value)
            fp.write(('"%s"\t' % key) + "\t".join(cells) + "\n")

    log_kv("Done:              ", __file__)
    print("(%.1f min)" % ((time.time() - start_time) / 60.0))