diff --git a/README.md b/README.md
index 3ab7fd5..c043081 100644
--- a/README.md
+++ b/README.md
@@ -101,7 +101,7 @@ $ python3 client_http.py
 ```
 请注意,开启API服务器之后,需要使用本ASRT项目对应的客户端软件来进行语音识别,详见Wiki文档[下载ASRT语音识别客户端SDK和Demo](https://wiki.ailemon.net/docs/asrt-doc/download)。
 
-如果要训练和使用非251版模型,请在代码中 `import speech_model_zoo` 的相应位置做修改。
+如果要训练和使用非251bn版模型,请在代码中 `import speech_model_zoo` 的相应位置做修改。
 
 使用docker直接部署ASRT:
 ```shell
diff --git a/README_EN.md b/README_EN.md
index 3cee922..c0feee1 100644
--- a/README_EN.md
+++ b/README_EN.md
@@ -97,7 +97,7 @@ To test whether it is successful or not that calls api service interface:
 $ python3 client_http.py
 ```
 
-If you want to train and use other model(not Model 251), make changes in the corresponding position of the `import speech_model_zoo` in the code files.
+If you want to train and use a model other than Model 251bn, make changes at the corresponding `import speech_model_zoo` locations in the code files.
 
 If there is any problem during the execution of the program or during use, it can be promptly put forward in the issue, and I will reply as soon as possible.
diff --git a/asrserver.py b/asrserver.py
index 781a994..f3f8489 100644
--- a/asrserver.py
+++ b/asrserver.py
@@ -26,7 +26,7 @@
 import http.server
 import socket
 
 from speech_model import ModelSpeech
-from speech_model_zoo import SpeechModel251
+from speech_model_zoo import SpeechModel251BN
 from speech_features import Spectrogram
 from LanguageModel2 import ModelLanguage
@@ -35,13 +35,13 @@ CHANNELS = 1
 # 默认输出的拼音的表示大小是1428,即1427个拼音+1个空白块
 OUTPUT_SIZE = 1428
 
-sm251 = SpeechModel251(
+sm251bn = SpeechModel251BN(
     input_shape=(AUDIO_LENGTH, AUDIO_FEATURE_LENGTH, CHANNELS),
     output_size=OUTPUT_SIZE
     )
 feat = Spectrogram()
-ms = ModelSpeech(sm251, feat, max_label_length=64)
-ms.load_model('save_models/' + sm251.get_model_name() + '.model.h5')
+ms = ModelSpeech(sm251bn, feat, max_label_length=64)
+ms.load_model('save_models/' + sm251bn.get_model_name() + '.model.h5')
 
 ml = ModelLanguage('model_language')
 ml.LoadModel()
diff --git a/asrserver_http.py b/asrserver_http.py
index a5761f5..91ae0e6 100644
--- a/asrserver_http.py
+++ b/asrserver_http.py
@@ -23,12 +23,13 @@
 ASRT语音识别基于HTTP协议的API服务器程序
 """
 
+import argparse
 import base64
 import json
 from flask import Flask, Response, request
 
 from speech_model import ModelSpeech
-from speech_model_zoo import SpeechModel251
+from speech_model_zoo import SpeechModel251BN
 from speech_features import Spectrogram
 from LanguageModel2 import ModelLanguage
 from utils.ops import decode_wav_bytes
@@ -36,10 +37,15 @@
 API_STATUS_CODE_OK = 200000 # OK
 API_STATUS_CODE_CLIENT_ERROR = 400000
 API_STATUS_CODE_CLIENT_ERROR_FORMAT = 400001 # 请求数据格式错误
-API_STATUS_CODE_CLIENT_ERROR_FORMAT = 400002 # 请求数据配置不支持
+API_STATUS_CODE_CLIENT_ERROR_CONFIG = 400002 # 请求数据配置不支持
 API_STATUS_CODE_SERVER_ERROR = 500000
 API_STATUS_CODE_SERVER_ERROR_RUNNING = 500001 # 服务器运行中出错
 
+parser = argparse.ArgumentParser(description='ASRT HTTP+Json RESTful API Service')
+parser.add_argument('--listen', default='0.0.0.0', type=str, help='the network to listen')
+parser.add_argument('--port', default='20001', type=str, help='the port to listen')
+args = parser.parse_args()
+
 app = Flask("ASRT API Service")
 
 AUDIO_LENGTH = 1600
@@ -47,13 +53,13 @@ CHANNELS = 1
 # 默认输出的拼音的表示大小是1428,即1427个拼音+1个空白块
 OUTPUT_SIZE = 1428
 
-sm251 = SpeechModel251(
+sm251bn = SpeechModel251BN(
     input_shape=(AUDIO_LENGTH, AUDIO_FEATURE_LENGTH, CHANNELS),
     output_size=OUTPUT_SIZE
     )
 feat = Spectrogram()
-ms = ModelSpeech(sm251, feat, max_label_length=64)
-ms.load_model('save_models/' + sm251.get_model_name() + '.model.h5')
+ms = ModelSpeech(sm251bn, feat, max_label_length=64)
+ms.load_model('save_models/' + sm251bn.get_model_name() + '.model.h5')
 
 ml = ModelLanguage('model_language')
 ml.LoadModel()
@@ -149,7 +155,7 @@ def recognition_post(level):
         json_data = AsrtApiResponse(API_STATUS_CODE_OK, 'all level')
         json_data.result = result
         buffer = json_data.to_json()
-        print('output:', buffer)
+        print('ASRT Result:', result, 'output:', buffer)
         return Response(buffer, mimetype='application/json')
     else:
         request_data = request.get_json()
@@ -165,6 +171,8 @@ def recognition_post(level):
             # request_data['samples'][-100:])
             json_data = AsrtApiResponse(API_STATUS_CODE_SERVER_ERROR, str(except_general))
             buffer = json_data.to_json()
+            # print("input:", request_data, "\n", "output:", buffer)
+            print("output:", buffer, "error:", except_general)
             return Response(buffer, mimetype='application/json')
 
 
@@ -173,4 +181,4 @@
     #app.run(host='0.0.0.0', port=20001)
     # for production env
     import waitress
-    waitress.serve(app, host='0.0.0.0', port=20001)
+    waitress.serve(app, host=args.listen, port=args.port)
diff --git a/evaluate_speech_model.py b/evaluate_speech_model.py
index 80b8036..f2c3294 100644
--- a/evaluate_speech_model.py
+++ b/evaluate_speech_model.py
@@ -26,7 +26,7 @@
 import os
 
 from speech_model import ModelSpeech
-from speech_model_zoo import SpeechModel251
+from speech_model_zoo import SpeechModel251BN
 from data_loader import DataLoader
 from speech_features import Spectrogram
@@ -37,14 +37,14 @@ CHANNELS = 1
 # 默认输出的拼音的表示大小是1428,即1427个拼音+1个空白块
 OUTPUT_SIZE = 1428
 
-sm251 = SpeechModel251(
+sm251bn = SpeechModel251BN(
     input_shape=(AUDIO_LENGTH, AUDIO_FEATURE_LENGTH, CHANNELS),
     output_size=OUTPUT_SIZE
     )
 feat = Spectrogram()
 evalue_data = DataLoader('dev')
-ms = ModelSpeech(sm251, feat, max_label_length=64)
+ms = ModelSpeech(sm251bn, feat, max_label_length=64)
 
-ms.load_model('save_models/' + sm251.get_model_name() + '.model.h5')
+ms.load_model('save_models/' + sm251bn.get_model_name() + '.model.h5')
 ms.evaluate_model(data_loader=evalue_data, data_count=-1, out_report=True, show_ratio=True, show_per_step=100)
diff --git a/predict_speech_file.py b/predict_speech_file.py
index fc2abbb..ab302bc 100644
--- a/predict_speech_file.py
+++ b/predict_speech_file.py
@@ -26,7 +26,7 @@
 import os
 
 from speech_model import ModelSpeech
-from speech_model_zoo import SpeechModel251
+from speech_model_zoo import SpeechModel251BN
 from speech_features import Spectrogram
 from LanguageModel2 import ModelLanguage
@@ -37,14 +37,14 @@ CHANNELS = 1
 # 默认输出的拼音的表示大小是1428,即1427个拼音+1个空白块
 OUTPUT_SIZE = 1428
 
-sm251 = SpeechModel251(
+sm251bn = SpeechModel251BN(
     input_shape=(AUDIO_LENGTH, AUDIO_FEATURE_LENGTH, CHANNELS),
     output_size=OUTPUT_SIZE
     )
 feat = Spectrogram()
-ms = ModelSpeech(sm251, feat, max_label_length=64)
+ms = ModelSpeech(sm251bn, feat, max_label_length=64)
 
-ms.load_model('save_models/' + sm251.get_model_name() + '.model.h5')
+ms.load_model('save_models/' + sm251bn.get_model_name() + '.model.h5')
 res = ms.recognize_speech_from_file('filename.wav')
 print('*[提示] 声学模型语音识别结果:\n', res)
diff --git a/train_speech_model.py b/train_speech_model.py
index d1596e7..feda74c 100644
--- a/train_speech_model.py
+++ b/train_speech_model.py
@@ -28,9 +28,9 @@
 from tensorflow.keras.optimizers import Adam
 
 from speech_model import ModelSpeech
-from speech_model_zoo import SpeechModel251
+from speech_model_zoo import SpeechModel251BN
 from data_loader import DataLoader
-from speech_features import Spectrogram
+from speech_features import SpecAugment
 
 os.environ["CUDA_VISIBLE_DEVICES"] = "0"
@@ -39,16 +39,16 @@ CHANNELS = 1
 # 默认输出的拼音的表示大小是1428,即1427个拼音+1个空白块
 OUTPUT_SIZE = 1428
 
-sm251 = SpeechModel251(
+sm251bn = SpeechModel251BN(
     input_shape=(AUDIO_LENGTH, AUDIO_FEATURE_LENGTH, CHANNELS),
     output_size=OUTPUT_SIZE
     )
-feat = Spectrogram()
+feat = SpecAugment()
 train_data = DataLoader('train')
 
 opt = Adam(lr = 0.0001, beta_1 = 0.9, beta_2 = 0.999, decay = 0.0, epsilon = 10e-8)
-ms = ModelSpeech(sm251, feat, max_label_length=64)
+ms = ModelSpeech(sm251bn, feat, max_label_length=64)
 
-#ms.load_model('save_models/' + sm251.get_model_name() + '.model.h5')
+#ms.load_model('save_models/' + sm251bn.get_model_name() + '.model.h5')
 ms.train_model(optimizer=opt, data_loader=train_data, epochs=50, save_step=1, batch_size=16, last_epoch=0)
-ms.save_model('save_models/' + sm251.get_model_name())
+ms.save_model('save_models/' + sm251bn.get_model_name())
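
With this patch, `asrserver_http.py` no longer hard-codes its bind address: the new `--listen` and `--port` flags (defaults `0.0.0.0` and `20001`, matching the previous hard-coded values) are passed straight to `waitress.serve()`. Usage, for example:

```shell
# start the HTTP API server with the defaults (same behavior as before)
$ python3 asrserver_http.py

# bind to localhost only, on an alternative port
$ python3 asrserver_http.py --listen 127.0.0.1 --port 20002
```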
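The README note above also still applies after the rename to 251bn: using a different acoustic model means editing each `from speech_model_zoo import ...` site touched by this patch. A minimal sketch of such a swap, assuming a hypothetical alternative class `SpeechModel25` exported by `speech_model_zoo` (check the module for the class names your version actually provides):

```python
from speech_model import ModelSpeech
from speech_model_zoo import SpeechModel25  # hypothetical alternative; SpeechModel251BN is the default
from speech_features import Spectrogram

AUDIO_LENGTH = 1600
AUDIO_FEATURE_LENGTH = 200  # assumed value, defined alongside AUDIO_LENGTH in the scripts (outside these hunks)
CHANNELS = 1
OUTPUT_SIZE = 1428  # 1427 pinyin labels + 1 CTC blank

# construct the alternative model with the same I/O shape,
# then load the weights file named after that model
sm = SpeechModel25(
    input_shape=(AUDIO_LENGTH, AUDIO_FEATURE_LENGTH, CHANNELS),
    output_size=OUTPUT_SIZE
    )
feat = Spectrogram()
ms = ModelSpeech(sm, feat, max_label_length=64)
ms.load_model('save_models/' + sm.get_model_name() + '.model.h5')
```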