1. Project Introduction
The MAX78000 development board is a powerful low-power AI solution. Its hardware integrates a convolutional neural network (CNN) accelerator, and networks can be trained with conventional toolchains such as PyTorch and TensorFlow, which makes it well suited to voice control and human-machine interaction. This project uses the MAX78000 board to build a prototype system that carries out human-machine interaction through voice commands.
2. Project Design Approach
- Prepare a dataset of voice commands for AI model training
- Train on that dataset with PyTorch to produce a speech-recognition model file that recognizes the specific commands
- Write a speech-recognition algorithm suited to the neural-network accelerator
- Develop the speech-recognition interaction logic
- Use the MAX78000 board's microphone as the input source for voice commands, run real-time speech recognition on the chip's neural-network accelerator, and convert the voice commands to text
- Parse the text commands, drive the relevant devices, and carry out the corresponding tasks
3. Approach to Collecting Material
Corpus sources: Common Voice (mozilla.org), EdgeTTS, and manual recordings
3.1 Generating corpus data with TTS
import os
import subprocess
import concurrent.futures

from pydub import AudioSegment
from pypinyin import pinyin, Style


def convert_mp3_to_wav(input_file, output_file, sample_width=2, channels=1, frame_rate=16000):
    # Read the MP3 file
    audio = AudioSegment.from_file(input_file, format="mp3")
    # Apply the target parameters
    audio = audio.set_frame_rate(frame_rate)
    audio = audio.set_sample_width(sample_width)
    audio = audio.set_channels(channels)
    # Save the result as a WAV file
    audio.export(output_file, format="wav")


def get_audio_info(audio_file):
    print(audio_file)
    audio = AudioSegment.from_file(audio_file)
    # Sample rate of the audio
    sample_rate = audio.frame_rate
    # Number of channels
    channels = audio.channels
    # Bit rate (pydub's frame_width already covers all channels)
    bit_rate = audio.frame_width * 8 * sample_rate / 1000
    # Duration in seconds
    duration = len(audio) / 1000
    print("Sample rate: {} Hz".format(sample_rate))
    print("Channels: {}".format(channels))
    print("Bit rate: {:.2f} kbps".format(bit_rate))
    print("Duration: {:.2f} s".format(duration))


def chinese_to_pinyin(chinese_text):
    # Convert the Chinese text into a list of pinyin syllables
    pinyin_list = pinyin(chinese_text, style=Style.NORMAL)
    # Join the syllables into a single string
    pinyin_str = ''.join([item[0] for item in pinyin_list])
    return pinyin_str


def gen_voice_file(output_path: str, convert_path: str, name: str, index: int, model: str, speed: int, spd_index: int):
    key = chinese_to_pinyin(name)
    gen_file_path = os.path.join(output_path, f'{key}_{index}_RATE{spd_index}.mp3')
    # edge-tts expects a signed rate such as --rate=+10% or --rate=-10%;
    # omit the flag entirely for the default speed
    voice_speed = '' if speed == 0 else f' --rate={speed:+d}%'
    convert_cmd = f'edge-tts{voice_speed} --voice {model} --text "{name}" --write-media {gen_file_path}'
    print(convert_cmd)
    process = subprocess.Popen(convert_cmd,
                               shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT,
                               encoding='utf-8')
    process.wait()
    if process.returncode != 0:
        print(process.stdout.read())
    else:
        convert_name = os.path.join(convert_path, f'{key}_{index}_RATE{spd_index}_convert.wav')
        convert_mp3_to_wav(gen_file_path, convert_name)
        get_audio_info(convert_name)


def cmd2voice(output_path: str, cmds: list):
    spd_list = [-5, -10, -15, -20, -25, -30, 0, 5, 10, 15, 20, 25, 30]
    voice_model_list = [
        'zh-CN-XiaoxiaoNeural',
        'zh-CN-XiaoyiNeural',
        'zh-CN-YunjianNeural',
        'zh-CN-YunxiNeural',
        'zh-CN-YunxiaNeural',
        'zh-CN-YunyangNeural',
        'zh-CN-liaoning-XiaobeiNeural',
        'zh-CN-shaanxi-XiaoniNeural',
        'zh-TW-HsiaoChenNeural',
        'zh-TW-HsiaoYuNeural',
        'zh-TW-YunJheNeural',
    ]
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    for item in cmds:
        key = chinese_to_pinyin(item)
        # Raw MP3 output goes into <output_path>/<pinyin>,
        # converted WAV files into <output_path>/zh_<pinyin>
        file_path = os.path.join(output_path, key)
        if not os.path.exists(file_path):
            os.mkdir(file_path)
        else:
            for filename in os.listdir(file_path):
                os.remove(os.path.join(file_path, filename))
        convert_path = os.path.join(output_path, f'zh_{key}')
        if not os.path.exists(convert_path):
            os.mkdir(convert_path)
        else:
            for filename in os.listdir(convert_path):
                os.remove(os.path.join(convert_path, filename))
        # One task per (voice, rate) combination, run in parallel:
        # 11 voices x 13 rates = 143 synthesis tasks per command word
        with concurrent.futures.ThreadPoolExecutor(max_workers=len(voice_model_list) * len(spd_list)) as executor:
            for index, voice_model in enumerate(voice_model_list):
                for spd_index, spd in enumerate(spd_list):
                    executor.submit(gen_voice_file, file_path, convert_path, item, index, voice_model, spd, spd_index)


if __name__ == '__main__':
    cmd2voice('tts_gen', ['是的', '不是', '好的', '打开', '关闭', '确定', '返回'])
3.2 Generating corpus data with manual recordings
3.3 Audio file format conversion
Training the speech-recognition model with PyTorch places the following requirements on the corpus (a quick format check follows the list):
- 16 kHz sample rate
- 16-bit little-endian PCM encoding
- Single channel (mono)
- WAV file format
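Before training, each file can be spot-checked against these requirements with Python's built-in wave module. A minimal sketch (the file name is a placeholder):

import wave

def check_kws_format(path):
    # True if the WAV file is 16 kHz, 16-bit PCM, mono
    # (WAV PCM samples are little-endian by definition)
    with wave.open(path, 'rb') as wf:
        return (wf.getframerate() == 16000
                and wf.getsampwidth() == 2
                and wf.getnchannels() == 1
                and wf.getcomptype() == 'NONE')

print(check_kws_format('output_chunk_0.wav'))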
Converting audio files by hand is tedious, so we use Python to automate the conversion in one step. The implementation follows:
import os
import shutil
import wave

from pydub import AudioSegment
from pydub.silence import split_on_silence
from pypinyin import pinyin, Style


def get_audio_info(audio_file):
    print(audio_file)
    with wave.open(audio_file, 'rb') as wf:
        print("Channels:", wf.getnchannels())
        print("Sample Width:", wf.getsampwidth())
        print("Frame Rate:", wf.getframerate())
        print("Frames:", wf.getnframes())
        print("Compression Type:", wf.getcomptype())
        print("Compression Name:", wf.getcompname())
    audio = AudioSegment.from_file(audio_file)
    # Sample rate of the audio
    sample_rate = audio.frame_rate
    # Number of channels
    channels = audio.channels
    # Bit rate (pydub's frame_width already covers all channels)
    bit_rate = audio.frame_width * 8 * sample_rate / 1000
    # Duration in seconds
    duration = len(audio) / 1000
    print("Sample rate: {} Hz".format(sample_rate))
    print("Channels: {}".format(channels))
    print("Bit rate: {:.2f} kbps".format(bit_rate))
    print("Duration: {:.2f} s".format(duration))


def convert_wav(input_file, output_file, sample_width=2, channels=1, frame_rate=16000):
    # Read the source WAV file
    audio = AudioSegment.from_file(input_file, format="wav")
    # Apply the target parameters
    audio = audio.set_frame_rate(frame_rate)
    audio = audio.set_sample_width(sample_width)
    audio = audio.set_channels(channels)
    # Save the result as a 16 kHz / 16-bit / mono WAV file
    audio.export(output_file, format="wav")


def chinese_to_pinyin(chinese_text):
    # Convert the Chinese text into a list of pinyin syllables
    pinyin_list = pinyin(chinese_text, style=Style.NORMAL)
    # Join the syllables into a single string
    pinyin_str = ''.join([item[0] for item in pinyin_list])
    return pinyin_str


def split_audio_on_silence(audio_file, silence_threshold=-55):
    base_name = os.path.basename(audio_file)
    base_name = os.path.splitext(base_name)[0]
    base_name_en = chinese_to_pinyin(base_name)
    out_path = os.path.join(os.getcwd(), base_name_en)
    convert_path = os.path.join(out_path, 'convert')
    # Start from a clean output directory
    if os.path.exists(out_path):
        shutil.rmtree(out_path)
    os.makedirs(convert_path)  # also creates out_path
    # Load the audio file
    audio = AudioSegment.from_file(audio_file)
    # Split the recording wherever silence is detected
    audio_chunks = split_on_silence(audio, silence_thresh=silence_threshold)
    # Save each chunk, then convert it to the training format
    for i, chunk in enumerate(audio_chunks):
        out_name = os.path.join(out_path, f'output_chunk_{i}.wav')
        chunk.export(out_name, format="wav")
        convert_name = os.path.join(convert_path, f'output_chunk_{i}.wav')
        convert_wav(out_name, convert_name)
        get_audio_info(convert_name)


# Replace with the actual recording file paths
for i in ['不是.wav', '是的.wav']:
    if os.path.exists(i):
        split_audio_on_silence(i)
4. Pre-training Implementation
1. Check the CUDA version installed on Windows with the command below; a system with the NVIDIA GPU driver installed normally comes with CUDA:
nvidia-smi
2. Download the CUDA toolkit for Ubuntu under WSL, matching the CUDA version on Windows
Install it with the following commands:
wget https://developer.download.nvidia.com/compute/cuda/repos/wsl-ubuntu/x86_64/cuda-wsl-ubuntu.pin
sudo mv cuda-wsl-ubuntu.pin /etc/apt/preferences.d/cuda-repository-pin-600
wget https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda-repo-wsl-ubuntu-11-7-local_11.7.1-1_amd64.deb
sudo dpkg -i cuda-repo-wsl-ubuntu-11-7-local_11.7.1-1_amd64.deb
sudo cp /var/cuda-repo-wsl-ubuntu-11-7-local/cuda-*-keyring.gpg /usr/share/keyrings/
sudo apt-get update
sudo apt-get -y install cuda
3. Install Miniconda3
Download link: Miniconda — miniconda documentation
Copy the downloaded file into WSL2 and run it. If it reports insufficient permissions on startup, use chmod to make the file executable.
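For example (the installer file name below is the current default and may differ for other versions):
chmod +x Miniconda3-latest-Linux-x86_64.sh
./Miniconda3-latest-Linux-x86_64.sh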
Start conda
# start conda
source ~/miniconda3/bin/activate
Add conda mirror channels
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda
conda config --set show_channel_urls true
Create a Python environment, pinning the Python version to 3.8.11
conda create -n py38 python=3.8.11
Use conda activate py38 to activate the newly created py38 environment.
4. Clone the MaximIntegratedAI repositories
git clone --recursive https://github.com/MaximIntegratedAI/ai8x-synthesis.git
git clone --recursive https://github.com/MaximIntegratedAI/ai8x-training.git
Enter the ai8x-synthesis directory and install the packages the environment requires:
pip install -r requirements.txt -i https://pypi.douban.com/simple/
Enter the ai8x-training directory and install the packages the environment requires:
pip install -r requirements-cu11.txt -i https://pypi.douban.com/simple/
5. Train the model
./scripts/train_kws20.sh
5. Training Process
5.1 Adding corpus data
Place the generated recording files under the path below. This round mainly adds corpus for the two words "是的" (yes) and "不是" (no), covering both manual recordings and TTS-generated files; other commands such as "打开" (open), "关闭" (close), and "确定" (confirm) are used for testing.
\ai8x-training\data\KWS\raw
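The raw-data directory then ends up with one sub-folder per keyword (an assumed layout, following the KWS loader's one-folder-per-label convention; the folder names match the class names that appear in the detection log later):
\ai8x-training\data\KWS\raw\zh_shide    (recordings and TTS output for "是的")
\ai8x-training\data\KWS\raw\zh_bushi    (recordings and TTS output for "不是")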
5.2 Modifying the training scripts
Open /ai8x-training/datasets/kws20.py and modify the following parts (a sketch of these edits follows the list):
1. Modify class_dict
2. Modify datasets
3. Modify the number of recognized voice commands
For step 3, open /ai8x-training/models/ai85net-kws20.py.
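The sketch below shows the shape of these edits. It is a sketch only, not the upstream files: the real kws20.py lists all of the original keywords, and exact indices, weights, and loader references depend on the repository version. The new class names zh_shide and zh_bushi match the corpus folder names and the detection log shown later.

# datasets/kws20.py -- register the new keywords (abridged)
class_dict = {
    'up': 0, 'down': 1, 'left': 2, 'right': 3,  # ...existing keywords elided...
    'zh_shide': 20,  # new: "是的" (yes)
    'zh_bushi': 21,  # new: "不是" (no)
}

datasets = [
    {
        'name': 'KWS_20',
        'input': (128, 128),
        # New keywords are appended before the catch-all class
        'output': ('up', 'down', 'left', 'right',  # ...existing keywords elided...
                   'zh_shide', 'zh_bushi', 'UNKNOWN'),
        'weight': (1, 1, 1, 1, 1, 1, 0.06),
        'loader': None,  # the real file references its KWS_20 loader here
    },
]

# models/ai85net-kws20.py -- grow the model's num_classes argument by the
# number of keywords added (upstream it covers the 20 keywords plus UNKNOWN).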
5.3 Starting training
To simplify operation we wrote a train_model.sh script that runs training in one step. Its contents are shown below:
cd ~/MAX78000/ai8x-training
echo "Changed directory to: $(pwd)"
file_path="$(pwd)/data/KWS/processed/dataset2.pt"
if [ -e "$file_path" ]; then
    echo "dataset2.pt exists; please delete it manually"
    # rm "$file_path"
else
    echo "dataset2.pt does not exist"
fi
ulimit -n 65536
free -h
python train.py --epochs 200 --optimizer Adam --lr 0.001 --wd 0 --deterministic --compress policies/schedule_kws20.yaml --model ai85kws20net --dataset KWS_20 --confusion --device MAX78000 "$@"
Run the following command from the MAX78000 root directory to start training:
./train_model.sh
Training is complete when the script finishes and prints its final evaluation results.
5.4 Generating code
To simplify operation we wrote a pack.sh script that does the packaging in one step. Its contents are shown below:
echo "0. Environment init"
ulimit -n 65536
echo "1. Quantize"
cd ~/MAX78000/ai8x-synthesis
echo "Current directory: $(pwd)"
python quantize.py trained/qat_best.pth.tar trained/qat_best-q.pth.tar --device MAX78000 -v "$@"
echo "2. Evaluate"
cd ~/MAX78000/ai8x-training
echo "Changed directory to: $(pwd)"
python train.py --model ai85kws20net --dataset KWS_20 --confusion --evaluate --exp-load-weights-from ../ai8x-synthesis/trained/qat_best-q.pth.tar -8 --device MAX78000 "$@"
echo "3. Generate the demo"
cd ~/MAX78000/ai8x-synthesis
echo "Changed directory to: $(pwd)"
directory_path="$(pwd)/sdk/Examples/MAX78000/CNN"
if [ -d "$directory_path/kws20_custom_demo" ]; then
    echo "Directory 'kws20_custom_demo' exists and will be deleted"
    rm -r "$directory_path/kws20_custom_demo"
fi
free -h
DEVICE="MAX78000"
TARGET="sdk/Examples/$DEVICE/CNN"
COMMON_ARGS="--device $DEVICE --timer 0 --display-checkpoint --verbose"
python ai8xize.py --test-dir $TARGET --prefix kws20_custom_demo --checkpoint-file trained/qat_best-q.pth.tar --config-file networks/kws20-hwc.yaml --softmax $COMMON_ARGS "$@"
When the script runs to completion, code generation is finished.
5.5 Functional debugging
1. Create a new CNN project
2. Update the model files
Copy the generated sources under \ai8x-synthesis\sdk\Examples\MAX78000\CNN\kws20_custom_demo into the new project
3. Modify the command description strings
4. Download and run
Press Ctrl+B to start the build
17:44:43 **** Incremental Build of configuration Default for project kws20 ****
make -r -j 8 --output-sync=target --no-print-directory
Loaded project.mk
D:/MaximSDK/Libraries/SDHC/ff13/fat32.mk:42: Warning: Building with FatFS R0.13. The MSDK will update the default FatFS version to R0.15 next release. See https://github.com/Analog-Devices-MSDK/msdk/pull/720
****************************************************************************
* Analog Devices MSDK
* - User Guide: https://analog-devices-msdk.github.io/msdk/USERGUIDE/
* - Get Support: https://www.analog.com/support/technical-support.html
* - Report Issues: https://github.com/Analog-Devices-MSDK/msdk/issues
* - Contributing: https://analog-devices-msdk.github.io/msdk/CONTRIBUTING/
****************************************************************************
/usr/bin/make -C D:/MaximSDK/Libraries/SDHC/ff13 lib BUILD_DIR=/d/Project/MAX78000/kws20/build/Fat32Driver BOARD=FTHR_RevA
/usr/bin/make -C D:/MaximSDK/Libraries/SDHC lib BUILD_DIR=/d/Project/MAX78000/kws20/build/SDHCDriver BOARD=FTHR_RevA
make[1]: Nothing to be done for 'lib'.
make[1]: Nothing to be done for 'lib'.
arm-none-eabi-size --format=berkeley /d/Project/MAX78000/kws20/build/kws20.elf
text data bss dec hex filename
368248 2512 35784 406544 63410 D:/Project/MAX78000/kws20/build/kws20.elf
17:44:44 Build Finished. 0 errors, 1 warnings. (took 353ms)
Click the Run button at the top left of the IDE to flash and execute the program.
Open a serial terminal and connect it to the MAX78000 board's serial port. Say "是的" and "不是"; the log below shows both commands being recognized successfully:
ANALOG DEVICES
Keyword Spotting Demo
Ver. 3.2.3 (5/05/23)
***** Init *****
pChunkBuff: 128
pAI85Buffer: 16384
*** I2S & Mic Init ***
*** READY ***
Word starts from index 3840 to 10496, padded with 5888 zeros, avg:458 > 350
026880: Starts CNN: 1
026880: Completes CNN: 1
CNN Time: 2584 us
Min: -34, Max: 45
-----------------------------------------
Detected word: zh_shide (96.5%)
-----------------------------------------
Word starts from index 1920 to 11008, padded with 3456 zeros, avg:413 > 350
054272: Starts CNN: 2
054272: Completes CNN: 2
CNN Time: 2584 us
Min: -20, Max: 27
-----------------------------------------
Detected word: zh_bushi (88.3%)
-----------------------------------------
5.6 Filling out the questionnaire
1. Connect the development board and open the questionnaire application
2. Fill out the survey by voice command, answering "是的" (yes) or "不是" (no) (a host-side sketch follows this list)
3. When the questionnaire is complete, the results are displayed
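One way for the host side to consume the recognition results is to read the board's UART log and map detected keywords to questionnaire answers. A minimal sketch, assuming the pyserial package, a COM3 port, and the "Detected word:" log format shown above:

import re
import serial  # pyserial; an assumed host-side dependency

PATTERN = re.compile(r'Detected word: (\w+) \((\d+\.\d+)%\)')

def next_answer(port='COM3', baudrate=115200):
    # Block until the board reports a recognized keyword,
    # then map it to a questionnaire answer.
    with serial.Serial(port, baudrate, timeout=60) as ser:
        while True:
            line = ser.readline().decode(errors='ignore')
            match = PATTERN.search(line)
            if match is None:
                continue
            word, confidence = match.group(1), float(match.group(2))
            if word == 'zh_shide':
                return 'yes', confidence
            if word == 'zh_bushi':
                return 'no', confidence

if __name__ == '__main__':
    print(next_answer())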
6. Reflections
We were fortunate to take part in season two of the MAX78000 AI application design contest. After several months of intense preparation and teamwork, we successfully completed a project that fills out a survey questionnaire by voice. We are grateful for this contest: it helped us understand ourselves better and clarified our direction for future growth. We hope there will be more opportunities like this to keep challenging ourselves and reaching for greater technical heights.
We very much look forward to the next event!