2つの音声ファイルを比較して同一人物か判定する

ライブラリのバージョンの組み合わせ次第で色々なエラーになるので、動作した構成を覚え書きとして残しておく。

import torch
import torchaudio
import huggingface_hub
from huggingface_hub import hf_hub_download as _orig_hf_hub_download


# --- ここから互換ラッパー ------------------------
def _compat_hf_hub_download(*args, use_auth_token=None, token=None, **kwargs):
    """Call the original ``hf_hub_download``, translating the legacy auth keyword.

    speechbrain may still call ``hf_hub_download`` with the old
    ``use_auth_token`` keyword; this wrapper maps it onto the ``token``
    keyword used by newer huggingface_hub releases.

    Args:
        *args: Positional arguments forwarded unchanged.
        use_auth_token: Legacy auth argument (a token string, or a bool).
            Used only when ``token`` is not given.
        token: New-style auth argument. Takes precedence over
            ``use_auth_token`` when both are supplied.
        **kwargs: Remaining keyword arguments forwarded unchanged.

    Returns:
        Whatever the original ``hf_hub_download`` returns.
    """
    # Forward the legacy value as-is, including booleans. ``token`` is
    # documented as accepting ``str`` or ``bool``, so an explicit ``False``
    # (do not send a stored token) is preserved rather than silently turned
    # into ``None``.
    if token is None and use_auth_token is not None:
        token = use_auth_token

    return _orig_hf_hub_download(*args, token=token, **kwargs)


# Monkey-patch the module attribute so that lookups of
# `huggingface_hub.hf_hub_download` resolve to the compatibility wrapper.
# NOTE(review): this only affects code that looks the function up on the
# module after this line runs; anything that already did
# `from huggingface_hub import hf_hub_download` keeps the original.
# Presumably speechbrain resolves it via the module — confirm.
huggingface_hub.hf_hub_download = _compat_hf_hub_download
# --- 互換ラッパーここまで ------------------------

# Warning 対応：pretrained は deprecated なので inference から import する
from speechbrain.inference import SpeakerRecognition


def main(file1="voice1.webm", file2="voice2.webm"):
    """Compare two audio files and print whether they are the same speaker.

    Args:
        file1: Path of the first audio file (defaults to ``voice1.webm``).
        file2: Path of the second audio file (defaults to ``voice2.webm``).

    Prints the score and the boolean prediction returned by
    ``SpeakerRecognition.verify_files``.
    """
    # Load the pretrained speaker-recognition model.
    verification = SpeakerRecognition.from_hparams(
        source="speechbrain/spkrec-ecapa-voxceleb",
        savedir="pretrained_models/spkrec-ecapa-voxceleb",
    )

    # verify_files takes the file paths directly, so the files are not
    # pre-loaded here; the previous torchaudio.load() results were unused.
    score, prediction = verification.verify_files(file1, file2)

    print("score:", score)
    print("same speaker? ->", bool(prediction))


if __name__ == "__main__":
    main()

uv add requests torch torchaudio torchcodec
uv pip install "speechbrain @ git+https://github.com/speechbrain/speechbrain.git@develop"
uv pip install "huggingface-hub<1.0.0"
uv run main.py

あのVTuberの中の人、あの人なのかぁ……。

関連