1. Project Introduction
The MAX78000 development board is a powerful low-power AI solution. Its hardware integrates a convolutional neural network (CNN) accelerator, and networks can be trained with conventional toolchains such as PyTorch and TensorFlow, which makes it well suited to voice control and human-machine interaction. This project uses the MAX78000 board to build a prototype system that carries out human-machine interaction through voice commands.
2. Project Design Approach
- Prepare a dataset of voice commands for AI model training
- Train on that dataset with PyTorch to produce a speech-recognition model file that recognizes the specific commands
- Write a speech-recognition algorithm suited to the neural-network accelerator
- Develop the speech-recognition interaction logic
- Use the MAX78000 board's microphone as the input source for voice commands, run real-time speech recognition on the chip's neural-network accelerator, and convert the voice commands to text
- Parse the text commands, drive the relevant devices, and carry out the corresponding tasks
3. Approach to Collecting Material
Corpus sources: Common Voice (mozilla.org), EdgeTTS, and manual recordings
3.1 Generating corpus data with TTS
import os
import subprocess
import concurrent.futures

from pydub import AudioSegment
from pypinyin import pinyin, Style


def convert_mp3_to_wav(input_file, output_file, sample_width=2, channels=1, frame_rate=16000):
    # Read the MP3 file
    audio = AudioSegment.from_file(input_file, format="mp3")
    # Apply the target parameters
    audio = audio.set_frame_rate(frame_rate)
    audio = audio.set_sample_width(sample_width)
    audio = audio.set_channels(channels)
    # Save the result as a WAV file
    audio.export(output_file, format="wav")


def get_audio_info(audio_file):
    print(audio_file)
    audio = AudioSegment.from_file(audio_file)
    # Sample rate of the audio
    sample_rate = audio.frame_rate
    # Number of channels
    channels = audio.channels
    # Bit rate (pydub's frame_width already covers all channels)
    bit_rate = audio.frame_width * 8 * sample_rate / 1000
    # Duration in seconds
    duration = len(audio) / 1000
    print("Sample rate: {} Hz".format(sample_rate))
    print("Channels: {}".format(channels))
    print("Bit rate: {:.2f} kbps".format(bit_rate))
    print("Duration: {:.2f} s".format(duration))


def chinese_to_pinyin(chinese_text):
    # Convert the Chinese text into a list of pinyin syllables
    pinyin_list = pinyin(chinese_text, style=Style.NORMAL)
    # Join the syllables into a single string
    pinyin_str = ''.join([item[0] for item in pinyin_list])
    return pinyin_str


def gen_voice_file(output_path: str, convert_path: str, name: str, index: int, model: str, speed: int, spd_index: int):
    key = chinese_to_pinyin(name)
    gen_file_path = os.path.join(output_path, f'{key}_{index}_RATE{spd_index}.mp3')
    # edge-tts expects a signed rate such as --rate=+10% or --rate=-10%;
    # omit the flag entirely for the default speed
    voice_speed = '' if speed == 0 else f' --rate={speed:+d}%'
    convert_cmd = f'edge-tts{voice_speed} --voice {model} --text "{name}" --write-media {gen_file_path}'
    print(convert_cmd)
    process = subprocess.Popen(convert_cmd,
                               shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT,
                               encoding='utf-8')
    process.wait()
    if process.returncode != 0:
        print(process.stdout.read())
    else:
        convert_name = os.path.join(convert_path, f'{key}_{index}_RATE{spd_index}_convert.wav')
        convert_mp3_to_wav(gen_file_path, convert_name)
        get_audio_info(convert_name)


def cmd2voice(output_path: str, cmds: list):
    spd_list = [-5, -10, -15, -20, -25, -30, 0, 5, 10, 15, 20, 25, 30]
    voice_model_list = [
        'zh-CN-XiaoxiaoNeural',
        'zh-CN-XiaoyiNeural',
        'zh-CN-YunjianNeural',
        'zh-CN-YunxiNeural',
        'zh-CN-YunxiaNeural',
        'zh-CN-YunyangNeural',
        'zh-CN-liaoning-XiaobeiNeural',
        'zh-CN-shaanxi-XiaoniNeural',
        'zh-TW-HsiaoChenNeural',
        'zh-TW-HsiaoYuNeural',
        'zh-TW-YunJheNeural',
    ]
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    for item in cmds:
        key = chinese_to_pinyin(item)
        # Raw MP3 output goes into <output_path>/<pinyin>,
        # converted WAV files into <output_path>/zh_<pinyin>
        file_path = os.path.join(output_path, key)
        if not os.path.exists(file_path):
            os.mkdir(file_path)
        else:
            for filename in os.listdir(file_path):
                os.remove(os.path.join(file_path, filename))
        convert_path = os.path.join(output_path, f'zh_{key}')
        if not os.path.exists(convert_path):
            os.mkdir(convert_path)
        else:
            for filename in os.listdir(convert_path):
                os.remove(os.path.join(convert_path, filename))
        # One task per (voice, rate) combination, run in parallel:
        # 11 voices x 13 rates = 143 synthesis tasks per command word
        with concurrent.futures.ThreadPoolExecutor(max_workers=len(voice_model_list) * len(spd_list)) as executor:
            for index, voice_model in enumerate(voice_model_list):
                for spd_index, spd in enumerate(spd_list):
                    executor.submit(gen_voice_file, file_path, convert_path, item, index, voice_model, spd, spd_index)


if __name__ == '__main__':
    cmd2voice('tts_gen', ['是的', '不是', '好的', '打开', '关闭', '确定', '返回'])
3.2 Generating corpus data with manual recordings
3.3 Audio file format conversion
Training the speech-recognition model with PyTorch places the following requirements on the corpus (a quick format check follows the list):
- 16 kHz sample rate
- 16-bit little-endian PCM encoding
- Single channel (mono)
- WAV file format
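Before training, each file can be spot-checked against these requirements with Python's built-in wave module. A minimal sketch (the file name is a placeholder):

import wave

def check_kws_format(path):
    # True if the WAV file is 16 kHz, 16-bit PCM, mono
    # (WAV PCM samples are little-endian by definition)
    with wave.open(path, 'rb') as wf:
        return (wf.getframerate() == 16000
                and wf.getsampwidth() == 2
                and wf.getnchannels() == 1
                and wf.getcomptype() == 'NONE')

print(check_kws_format('output_chunk_0.wav'))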
Converting audio files by hand is tedious, so we use Python to automate the conversion in one step. The implementation follows:
import os
import shutil
import wave

from pydub import AudioSegment
from pydub.silence import split_on_silence
from pypinyin import pinyin, Style


def get_audio_info(audio_file):
    print(audio_file)
    with wave.open(audio_file, 'rb') as wf:
        print("Channels:", wf.getnchannels())
        print("Sample Width:", wf.getsampwidth())
        print("Frame Rate:", wf.getframerate())
        print("Frames:", wf.getnframes())
        print("Compression Type:", wf.getcomptype())
        print("Compression Name:", wf.getcompname())
    audio = AudioSegment.from_file(audio_file)
    # Sample rate of the audio
    sample_rate = audio.frame_rate
    # Number of channels
    channels = audio.channels
    # Bit rate (pydub's frame_width already covers all channels)
    bit_rate = audio.frame_width * 8 * sample_rate / 1000
    # Duration in seconds
    duration = len(audio) / 1000
    print("Sample rate: {} Hz".format(sample_rate))
    print("Channels: {}".format(channels))
    print("Bit rate: {:.2f} kbps".format(bit_rate))
    print("Duration: {:.2f} s".format(duration))


def convert_wav(input_file, output_file, sample_width=2, channels=1, frame_rate=16000):
    # Read the source WAV file
    audio = AudioSegment.from_file(input_file, format="wav")
    # Apply the target parameters
    audio = audio.set_frame_rate(frame_rate)
    audio = audio.set_sample_width(sample_width)
    audio = audio.set_channels(channels)
    # Save the result as a 16 kHz / 16-bit / mono WAV file
    audio.export(output_file, format="wav")


def chinese_to_pinyin(chinese_text):
    # Convert the Chinese text into a list of pinyin syllables
    pinyin_list = pinyin(chinese_text, style=Style.NORMAL)
    # Join the syllables into a single string
    pinyin_str = ''.join([item[0] for item in pinyin_list])
    return pinyin_str


def split_audio_on_silence(audio_file, silence_threshold=-55):
    base_name = os.path.basename(audio_file)
    base_name = os.path.splitext(base_name)[0]
    base_name_en = chinese_to_pinyin(base_name)
    out_path = os.path.join(os.getcwd(), base_name_en)
    convert_path = os.path.join(out_path, 'convert')
    # Start from a clean output directory
    if os.path.exists(out_path):
        shutil.rmtree(out_path)
    os.makedirs(convert_path)  # also creates out_path
    # Load the audio file
    audio = AudioSegment.from_file(audio_file)
    # Split the recording wherever silence is detected
    audio_chunks = split_on_silence(audio, silence_thresh=silence_threshold)
    # Save each chunk, then convert it to the training format
    for i, chunk in enumerate(audio_chunks):
        out_name = os.path.join(out_path, f'output_chunk_{i}.wav')
        chunk.export(out_name, format="wav")
        convert_name = os.path.join(convert_path, f'output_chunk_{i}.wav')
        convert_wav(out_name, convert_name)
        get_audio_info(convert_name)


# Replace with the actual recording file paths
for i in ['不是.wav', '是的.wav']:
    if os.path.exists(i):
        split_audio_on_silence(i)
4. Pre-training Implementation
1. Check the CUDA version installed on Windows with the command below; a system with the NVIDIA GPU driver installed normally comes with CUDA:
nvidia-smi
2. Download the CUDA toolkit for Ubuntu under WSL, matching the CUDA version on Windows
Install it with the following commands:
wget https://developer.download.nvidia.com/compute/cuda/repos/wsl-ubuntu/x86_64/cuda-wsl-ubuntu.pin
sudo mv cuda-wsl-ubuntu.pin /etc/apt/preferences.d/cuda-repository-pin-600
wget https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda-repo-wsl-ubuntu-11-7-local_11.7.1-1_amd64.deb
sudo dpkg -i cuda-repo-wsl-ubuntu-11-7-local_11.7.1-1_amd64.deb
sudo cp /var/cuda-repo-wsl-ubuntu-11-7-local/cuda-*-keyring.gpg /usr/share/keyrings/
sudo apt-get update
sudo apt-get -y install cuda
3. Install Miniconda3
Download link: Miniconda — miniconda documentation
Copy the downloaded file into WSL2 and run it. If it reports insufficient permissions on startup, use chmod to make the file executable.
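For example (the installer file name below is the current default and may differ for other versions):
chmod +x Miniconda3-latest-Linux-x86_64.sh
./Miniconda3-latest-Linux-x86_64.sh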
Start conda
# start conda
source ~/miniconda3/bin/activate
Add conda mirror channels
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda
conda config --set show_channel_urls true
Create a Python environment, pinning the Python version to 3.8.11
conda create -n py38 python=3.8.11
Use conda activate py38 to activate the newly created py38 environment.
4. Clone the MaximIntegratedAI repositories
git clone --recursive https://github.com/MaximIntegratedAI/ai8x-synthesis.git
git clone --recursive https://github.com/MaximIntegratedAI/ai8x-training.git
Enter the ai8x-synthesis directory and install the packages the environment requires:
pip install -r requirements.txt -i https://pypi.douban.com/simple/
Enter the ai8x-training directory and install the packages the environment requires:
pip install -r requirements-cu11.txt -i https://pypi.douban.com/simple/
5. Train the model
./scripts/train_kws20.sh
5. Training Process
5.1 Adding corpus data
Place the generated recording files under the path below. This round mainly adds corpus for the two words "是的" (yes) and "不是" (no), covering both manual recordings and TTS-generated files; other commands such as "打开" (open), "关闭" (close), and "确定" (confirm) are used for testing.
\ai8x-training\data\KWS\raw
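The raw-data directory then ends up with one sub-folder per keyword (an assumed layout, following the KWS loader's one-folder-per-label convention; the folder names match the class names that appear in the detection log later):
\ai8x-training\data\KWS\raw\zh_shide    (recordings and TTS output for "是的")
\ai8x-training\data\KWS\raw\zh_bushi    (recordings and TTS output for "不是")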
5.2 Modifying the training scripts
Open /ai8x-training/datasets/kws20.py and modify the following parts (a sketch of these edits follows the list):
1. Modify class_dict
2. Modify datasets
3. Modify the number of recognized voice commands
For step 3, open /ai8x-training/models/ai85net-kws20.py.
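The sketch below shows the shape of these edits. It is a sketch only, not the upstream files: the real kws20.py lists all of the original keywords, and exact indices, weights, and loader references depend on the repository version. The new class names zh_shide and zh_bushi match the corpus folder names and the detection log shown later.

# datasets/kws20.py -- register the new keywords (abridged)
class_dict = {
    'up': 0, 'down': 1, 'left': 2, 'right': 3,  # ...existing keywords elided...
    'zh_shide': 20,  # new: "是的" (yes)
    'zh_bushi': 21,  # new: "不是" (no)
}

datasets = [
    {
        'name': 'KWS_20',
        'input': (128, 128),
        # New keywords are appended before the catch-all class
        'output': ('up', 'down', 'left', 'right',  # ...existing keywords elided...
                   'zh_shide', 'zh_bushi', 'UNKNOWN'),
        'weight': (1, 1, 1, 1, 1, 1, 0.06),
        'loader': None,  # the real file references its KWS_20 loader here
    },
]

# models/ai85net-kws20.py -- grow the model's num_classes argument by the
# number of keywords added (upstream it covers the 20 keywords plus UNKNOWN).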
5.3 Starting training
To simplify operation we wrote a train_model.sh script that runs training in one step. Its contents are shown below:
cd ~/MAX78000/ai8x-training
echo "Changed directory to: $(pwd)"
file_path="$(pwd)/data/KWS/processed/dataset2.pt"
if [ -e "$file_path" ]; then
    echo "dataset2.pt exists; please delete it manually"
    # rm "$file_path"
else
    echo "dataset2.pt does not exist"
fi
ulimit -n 65536
free -h
python train.py --epochs 200 --optimizer Adam --lr 0.001 --wd 0 --deterministic --compress policies/schedule_kws20.yaml --model ai85kws20net --dataset KWS_20 --confusion --device MAX78000 "$@"
Run the following command from the MAX78000 root directory to start training:
./train_model.sh
Training is complete when the script finishes and prints its final evaluation results.
5.4 Generating code
To simplify operation we wrote a pack.sh script that does the packaging in one step. Its contents are shown below:
echo "0. Environment init"
ulimit -n 65536
echo "1. Quantize"
cd ~/MAX78000/ai8x-synthesis
echo "Current directory: $(pwd)"
python quantize.py trained/qat_best.pth.tar trained/qat_best-q.pth.tar --device MAX78000 -v "$@"
echo "2. Evaluate"
cd ~/MAX78000/ai8x-training
echo "Changed directory to: $(pwd)"
python train.py --model ai85kws20net --dataset KWS_20 --confusion --evaluate --exp-load-weights-from ../ai8x-synthesis/trained/qat_best-q.pth.tar -8 --device MAX78000 "$@"
echo "3. Generate the demo"
cd ~/MAX78000/ai8x-synthesis
echo "Changed directory to: $(pwd)"
directory_path="$(pwd)/sdk/Examples/MAX78000/CNN"
if [ -d "$directory_path/kws20_custom_demo" ]; then
    echo "Directory 'kws20_custom_demo' exists and will be deleted"
    rm -r "$directory_path/kws20_custom_demo"
fi
free -h
DEVICE="MAX78000"
TARGET="sdk/Examples/$DEVICE/CNN"
COMMON_ARGS="--device $DEVICE --timer 0 --display-checkpoint --verbose"
python ai8xize.py --test-dir $TARGET --prefix kws20_custom_demo --checkpoint-file trained/qat_best-q.pth.tar --config-file networks/kws20-hwc.yaml --softmax $COMMON_ARGS "$@"
When the script runs to completion, code generation is finished.
5.5 Functional debugging
1. Create a new CNN project
2. Update the model files
Copy the generated sources under \ai8x-synthesis\sdk\Examples\MAX78000\CNN\kws20_custom_demo into the new project
3. Modify the command description strings
4. Download and run
Press Ctrl+B to start the build
17:44:43 **** Incremental Build of configuration Default for project kws20 ****
make -r -j 8 --output-sync=target --no-print-directory
Loaded project.mk
D:/MaximSDK/Libraries/SDHC/ff13/fat32.mk:42: Warning: Building with FatFS R0.13. The MSDK will update the default FatFS version to R0.15 next release. See https://github.com/Analog-Devices-MSDK/msdk/pull/720
****************************************************************************
* Analog Devices MSDK
* - User Guide: https://analog-devices-msdk.github.io/msdk/USERGUIDE/
* - Get Support: https://www.analog.com/support/technical-support.html
* - Report Issues: https://github.com/Analog-Devices-MSDK/msdk/issues
* - Contributing: https://analog-devices-msdk.github.io/msdk/CONTRIBUTING/
****************************************************************************
/usr/bin/make -C D:/MaximSDK/Libraries/SDHC/ff13 lib BUILD_DIR=/d/Project/MAX78000/kws20/build/Fat32Driver BOARD=FTHR_RevA
/usr/bin/make -C D:/MaximSDK/Libraries/SDHC lib BUILD_DIR=/d/Project/MAX78000/kws20/build/SDHCDriver BOARD=FTHR_RevA
make[1]: Nothing to be done for 'lib'.
make[1]: Nothing to be done for 'lib'.
arm-none-eabi-size --format=berkeley /d/Project/MAX78000/kws20/build/kws20.elf
text data bss dec hex filename
368248 2512 35784 406544 63410 D:/Project/MAX78000/kws20/build/kws20.elf
17:44:44 Build Finished. 0 errors, 1 warnings. (took 353ms)
Click the Run button at the top left of the IDE to flash and execute the program.
Open a serial terminal and connect it to the MAX78000 board's serial port. Say "是的" and "不是"; the log below shows both commands being recognized successfully:
ANALOG DEVICES
Keyword Spotting Demo
Ver. 3.2.3 (5/05/23)
***** Init *****
pChunkBuff: 128
pAI85Buffer: 16384
*** I2S & Mic Init ***
*** READY ***
Word starts from index 3840 to 10496, padded with 5888 zeros, avg:458 > 350
026880: Starts CNN: 1
026880: Completes CNN: 1
CNN Time: 2584 us
Min: -34, Max: 45
-----------------------------------------
Detected word: zh_shide (96.5%)
-----------------------------------------
Word starts from index 1920 to 11008, padded with 3456 zeros, avg:413 > 350
054272: Starts CNN: 2
054272: Completes CNN: 2
CNN Time: 2584 us
Min: -20, Max: 27
-----------------------------------------
Detected word: zh_bushi (88.3%)
-----------------------------------------
5.6 Filling out the questionnaire
1. Connect the development board and open the questionnaire application
2. Fill out the survey by voice command, answering "是的" (yes) or "不是" (no) (a host-side sketch follows this list)
3. When the questionnaire is complete, the results are displayed
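One way for the host side to consume the recognition results is to read the board's UART log and map detected keywords to questionnaire answers. A minimal sketch, assuming the pyserial package, a COM3 port, and the "Detected word:" log format shown above:

import re
import serial  # pyserial; an assumed host-side dependency

PATTERN = re.compile(r'Detected word: (\w+) \((\d+\.\d+)%\)')

def next_answer(port='COM3', baudrate=115200):
    # Block until the board reports a recognized keyword,
    # then map it to a questionnaire answer.
    with serial.Serial(port, baudrate, timeout=60) as ser:
        while True:
            line = ser.readline().decode(errors='ignore')
            match = PATTERN.search(line)
            if match is None:
                continue
            word, confidence = match.group(1), float(match.group(2))
            if word == 'zh_shide':
                return 'yes', confidence
            if word == 'zh_bushi':
                return 'no', confidence

if __name__ == '__main__':
    print(next_answer())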
6. Reflections
We were fortunate to take part in season two of the MAX78000 AI application design contest. After several months of intense preparation and teamwork, we successfully completed a project that fills out a survey questionnaire by voice. We are grateful for this contest: it helped us understand ourselves better and clarified our direction for future growth. We hope there will be more opportunities like this to keep challenging ourselves and reaching for greater technical heights.
We very much look forward to the next event!