# Windows PowerShell
# Sets HF_TOKEN for the current PowerShell session. "hf_..." is a placeholder:
# substitute your own access token before running.
$env:HF_TOKEN = "hf_..."

# Linux/macOS
# Exports HF_TOKEN so that programs started from this shell inherit it.
# "hf_..." is a placeholder: substitute your own access token before running.
export HF_TOKEN="hf_..."

# Google Colab: read the secret named "HF_TOKEN" through google.colab.userdata
# and copy it into the process environment, which is where the diarization
# pipeline cell later looks it up (os.environ["HF_TOKEN"]).
from google.colab import userdata
import os
os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")

# Install the audio decoding, download, transcription and diarization packages
# used by the code below.
# NOTE(review): when run from a notebook cell this presumably needs a leading
# `!` or `%` (it looks stripped by the export) — confirm.
pip install pyannote.audio openai-whisper av yt-dlp scipy

Requirement already satisfied: pyannote.audio in /usr/local/lib/python3.12/dist-packages (4.0.4)
Requirement already satisfied: openai-whisper in /usr/local/lib/python3.12/dist-packages (20250625)
Requirement already satisfied: av in /usr/local/lib/python3.12/dist-packages (17.0.1)
Requirement already satisfied: yt-dlp in /usr/local/lib/python3.12/dist-packages (2026.3.17)
Requirement already satisfied: scipy in /usr/local/lib/python3.12/dist-packages (1.16.3)
Requirement already satisfied: asteroid-filterbanks>=0.4.0 in /usr/local/lib/python3.12/dist-packages (from pyannote.audio) (0.4.0)
Requirement already satisfied: einops>=0.8.1 in /usr/local/lib/python3.12/dist-packages (from pyannote.audio) (0.8.2)
Requirement already satisfied: huggingface-hub>=0.28.1 in /usr/local/lib/python3.12/dist-packages (from pyannote.audio) (1.11.0)
Requirement already satisfied: lightning>=2.4 in /usr/local/lib/python3.12/dist-packages (from pyannote.audio) (2.6.1)
Requirement already satisfied: matplotlib>=3.10.0 in /usr/local/lib/python3.12/dist-packages (from pyannote.audio) (3.10.0)
Requirement already satisfied: opentelemetry-api>=1.34.0 in /usr/local/lib/python3.12/dist-packages (from pyannote.audio) (1.42.0)
Requirement already satisfied: opentelemetry-exporter-otlp>=1.34.0 in /usr/local/lib/python3.12/dist-packages (from pyannote.audio) (1.42.0)
Requirement already satisfied: opentelemetry-sdk>=1.34.0 in /usr/local/lib/python3.12/dist-packages (from pyannote.audio) (1.42.0)
Requirement already satisfied: pyannote-core>=6.0.1 in /usr/local/lib/python3.12/dist-packages (from pyannote.audio) (6.0.1)
Requirement already satisfied: pyannote-database>=6.1.1 in /usr/local/lib/python3.12/dist-packages (from pyannote.audio) (6.1.1)
Requirement already satisfied: pyannote-metrics>=4.0.0 in /usr/local/lib/python3.12/dist-packages (from pyannote.audio) (4.1)
Requirement already satisfied: pyannote-pipeline>=4.0.0 in /usr/local/lib/python3.12/dist-packages (from pyannote.audio) (4.0.0)
Requirement already satisfied: pyannoteai-sdk>=0.3.0 in /usr/local/lib/python3.12/dist-packages (from pyannote.audio) (0.4.0)
Requirement already satisfied: pytorch-metric-learning>=2.8.1 in /usr/local/lib/python3.12/dist-packages (from pyannote.audio) (2.9.0)
Requirement already satisfied: rich>=13.9.4 in /usr/local/lib/python3.12/dist-packages (from pyannote.audio) (13.9.4)
Requirement already satisfied: safetensors>=0.5.2 in /usr/local/lib/python3.12/dist-packages (from pyannote.audio) (0.7.0)
Requirement already satisfied: torch-audiomentations>=0.12.0 in /usr/local/lib/python3.12/dist-packages (from pyannote.audio) (0.12.0)
Requirement already satisfied: torch>=2.8.0 in /usr/local/lib/python3.12/dist-packages (from pyannote.audio) (2.10.0+cu128)
Requirement already satisfied: torchaudio>=2.8.0 in /usr/local/lib/python3.12/dist-packages (from pyannote.audio) (2.10.0+cu128)
Requirement already satisfied: torchcodec>=0.7.0 in /usr/local/lib/python3.12/dist-packages (from pyannote.audio) (0.10.0+cu128)
Requirement already satisfied: torchmetrics>=1.6.1 in /usr/local/lib/python3.12/dist-packages (from pyannote.audio) (1.9.0)
Requirement already satisfied: more-itertools in /usr/local/lib/python3.12/dist-packages (from openai-whisper) (10.8.0)
Requirement already satisfied: numba in /usr/local/lib/python3.12/dist-packages (from openai-whisper) (0.65.1)
Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (from openai-whisper) (2.4.6)
Requirement already satisfied: tiktoken in /usr/local/lib/python3.12/dist-packages (from openai-whisper) (0.12.0)
Requirement already satisfied: tqdm in /usr/local/lib/python3.12/dist-packages (from openai-whisper) (4.67.3)
Requirement already satisfied: triton>=2 in /usr/local/lib/python3.12/dist-packages (from openai-whisper) (3.6.0)
Requirement already satisfied: typing-extensions in /usr/local/lib/python3.12/dist-packages (from asteroid-filterbanks>=0.4.0->pyannote.audio) (4.15.0)
Requirement already satisfied: filelock>=3.10.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.28.1->pyannote.audio) (3.29.0)
Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.28.1->pyannote.audio) (2025.3.0)
Requirement already satisfied: hf-xet<2.0.0,>=1.4.3 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.28.1->pyannote.audio) (1.4.3)
Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.28.1->pyannote.audio) (0.28.1)
Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.28.1->pyannote.audio) (26.1)
Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.28.1->pyannote.audio) (6.0.3)
Requirement already satisfied: typer in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.28.1->pyannote.audio) (0.24.2)
Requirement already satisfied: lightning-utilities<2.0,>=0.10.0 in /usr/local/lib/python3.12/dist-packages (from lightning>=2.4->pyannote.audio) (0.15.3)
Requirement already satisfied: pytorch-lightning in /usr/local/lib/python3.12/dist-packages (from lightning>=2.4->pyannote.audio) (2.6.1)
Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib>=3.10.0->pyannote.audio) (1.3.3)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib>=3.10.0->pyannote.audio) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib>=3.10.0->pyannote.audio) (4.62.1)
Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib>=3.10.0->pyannote.audio) (1.5.0)
Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib>=3.10.0->pyannote.audio) (11.3.0)
Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib>=3.10.0->pyannote.audio) (3.3.2)
Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.12/dist-packages (from matplotlib>=3.10.0->pyannote.audio) (2.9.0.post0)
Requirement already satisfied: opentelemetry-exporter-otlp-proto-grpc==1.42.0 in /usr/local/lib/python3.12/dist-packages (from opentelemetry-exporter-otlp>=1.34.0->pyannote.audio) (1.42.0)
Requirement already satisfied: opentelemetry-exporter-otlp-proto-http==1.42.0 in /usr/local/lib/python3.12/dist-packages (from opentelemetry-exporter-otlp>=1.34.0->pyannote.audio) (1.42.0)
Requirement already satisfied: googleapis-common-protos~=1.57 in /usr/local/lib/python3.12/dist-packages (from opentelemetry-exporter-otlp-proto-grpc==1.42.0->opentelemetry-exporter-otlp>=1.34.0->pyannote.audio) (1.74.0)
Requirement already satisfied: grpcio<2.0.0,>=1.63.2 in /usr/local/lib/python3.12/dist-packages (from opentelemetry-exporter-otlp-proto-grpc==1.42.0->opentelemetry-exporter-otlp>=1.34.0->pyannote.audio) (1.80.0)
Requirement already satisfied: opentelemetry-exporter-otlp-proto-common==1.42.0 in /usr/local/lib/python3.12/dist-packages (from opentelemetry-exporter-otlp-proto-grpc==1.42.0->opentelemetry-exporter-otlp>=1.34.0->pyannote.audio) (1.42.0)
Requirement already satisfied: opentelemetry-proto==1.42.0 in /usr/local/lib/python3.12/dist-packages (from opentelemetry-exporter-otlp-proto-grpc==1.42.0->opentelemetry-exporter-otlp>=1.34.0->pyannote.audio) (1.42.0)
Requirement already satisfied: requests~=2.7 in /usr/local/lib/python3.12/dist-packages (from opentelemetry-exporter-otlp-proto-http==1.42.0->opentelemetry-exporter-otlp>=1.34.0->pyannote.audio) (2.32.4)
Requirement already satisfied: protobuf<7.0,>=5.0 in /usr/local/lib/python3.12/dist-packages (from opentelemetry-proto==1.42.0->opentelemetry-exporter-otlp-proto-grpc==1.42.0->opentelemetry-exporter-otlp>=1.34.0->pyannote.audio) (5.29.6)
Requirement already satisfied: opentelemetry-semantic-conventions==0.63b0 in /usr/local/lib/python3.12/dist-packages (from opentelemetry-sdk>=1.34.0->pyannote.audio) (0.63b0)
Requirement already satisfied: pandas>=2.2.3 in /usr/local/lib/python3.12/dist-packages (from pyannote-core>=6.0.1->pyannote.audio) (3.0.3)
Requirement already satisfied: sortedcontainers>=2.4.0 in /usr/local/lib/python3.12/dist-packages (from pyannote-core>=6.0.1->pyannote.audio) (2.4.0)
Requirement already satisfied: scikit-learn>=1.6.1 in /usr/local/lib/python3.12/dist-packages (from pyannote-metrics>=4.0.0->pyannote.audio) (1.6.1)
Requirement already satisfied: optuna>=4.2.0 in /usr/local/lib/python3.12/dist-packages (from pyannote-pipeline>=4.0.0->pyannote.audio) (4.8.0)
Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.12/dist-packages (from rich>=13.9.4->pyannote.audio) (4.0.0)
Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.12/dist-packages (from rich>=13.9.4->pyannote.audio) (2.20.0)
Requirement already satisfied: setuptools in /usr/local/lib/python3.12/dist-packages (from torch>=2.8.0->pyannote.audio) (75.2.0)
Requirement already satisfied: sympy>=1.13.3 in /usr/local/lib/python3.12/dist-packages (from torch>=2.8.0->pyannote.audio) (1.14.0)
Requirement already satisfied: networkx>=2.5.1 in /usr/local/lib/python3.12/dist-packages (from torch>=2.8.0->pyannote.audio) (3.6.1)
Requirement already satisfied: jinja2 in /usr/local/lib/python3.12/dist-packages (from torch>=2.8.0->pyannote.audio) (3.1.6)
Requirement already satisfied: cuda-bindings==12.9.4 in /usr/local/lib/python3.12/dist-packages (from torch>=2.8.0->pyannote.audio) (12.9.4)
Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.8.93 in /usr/local/lib/python3.12/dist-packages (from torch>=2.8.0->pyannote.audio) (12.8.93)
Requirement already satisfied: nvidia-cuda-runtime-cu12==12.8.90 in /usr/local/lib/python3.12/dist-packages (from torch>=2.8.0->pyannote.audio) (12.8.90)
Requirement already satisfied: nvidia-cuda-cupti-cu12==12.8.90 in /usr/local/lib/python3.12/dist-packages (from torch>=2.8.0->pyannote.audio) (12.8.90)
Requirement already satisfied: nvidia-cudnn-cu12==9.10.2.21 in /usr/local/lib/python3.12/dist-packages (from torch>=2.8.0->pyannote.audio) (9.10.2.21)
Requirement already satisfied: nvidia-cublas-cu12==12.8.4.1 in /usr/local/lib/python3.12/dist-packages (from torch>=2.8.0->pyannote.audio) (12.8.4.1)
Requirement already satisfied: nvidia-cufft-cu12==11.3.3.83 in /usr/local/lib/python3.12/dist-packages (from torch>=2.8.0->pyannote.audio) (11.3.3.83)
Requirement already satisfied: nvidia-curand-cu12==10.3.9.90 in /usr/local/lib/python3.12/dist-packages (from torch>=2.8.0->pyannote.audio) (10.3.9.90)
Requirement already satisfied: nvidia-cusolver-cu12==11.7.3.90 in /usr/local/lib/python3.12/dist-packages (from torch>=2.8.0->pyannote.audio) (11.7.3.90)
Requirement already satisfied: nvidia-cusparse-cu12==12.5.8.93 in /usr/local/lib/python3.12/dist-packages (from torch>=2.8.0->pyannote.audio) (12.5.8.93)
Requirement already satisfied: nvidia-cusparselt-cu12==0.7.1 in /usr/local/lib/python3.12/dist-packages (from torch>=2.8.0->pyannote.audio) (0.7.1)
Requirement already satisfied: nvidia-nccl-cu12==2.27.5 in /usr/local/lib/python3.12/dist-packages (from torch>=2.8.0->pyannote.audio) (2.27.5)
Requirement already satisfied: nvidia-nvshmem-cu12==3.4.5 in /usr/local/lib/python3.12/dist-packages (from torch>=2.8.0->pyannote.audio) (3.4.5)
Requirement already satisfied: nvidia-nvtx-cu12==12.8.90 in /usr/local/lib/python3.12/dist-packages (from torch>=2.8.0->pyannote.audio) (12.8.90)
Requirement already satisfied: nvidia-nvjitlink-cu12==12.8.93 in /usr/local/lib/python3.12/dist-packages (from torch>=2.8.0->pyannote.audio) (12.8.93)
Requirement already satisfied: nvidia-cufile-cu12==1.13.1.3 in /usr/local/lib/python3.12/dist-packages (from torch>=2.8.0->pyannote.audio) (1.13.1.3)
Requirement already satisfied: cuda-pathfinder~=1.1 in /usr/local/lib/python3.12/dist-packages (from cuda-bindings==12.9.4->torch>=2.8.0->pyannote.audio) (1.5.3)
Requirement already satisfied: julius<0.3,>=0.2.3 in /usr/local/lib/python3.12/dist-packages (from torch-audiomentations>=0.12.0->pyannote.audio) (0.2.7)
Requirement already satisfied: torch-pitch-shift>=1.2.2 in /usr/local/lib/python3.12/dist-packages (from torch-audiomentations>=0.12.0->pyannote.audio) (1.2.5)
Requirement already satisfied: llvmlite<0.48,>=0.47.0dev0 in /usr/local/lib/python3.12/dist-packages (from numba->openai-whisper) (0.47.0)
Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.12/dist-packages (from tiktoken->openai-whisper) (2025.11.3)
Requirement already satisfied: aiohttp!=4.0.0a0,!=4.0.0a1 in /usr/local/lib/python3.12/dist-packages (from fsspec[http]<2028.0,>=2022.5.0->lightning>=2.4->pyannote.audio) (3.13.5)
Requirement already satisfied: anyio in /usr/local/lib/python3.12/dist-packages (from httpx<1,>=0.23.0->huggingface-hub>=0.28.1->pyannote.audio) (4.13.0)
Requirement already satisfied: certifi in /usr/local/lib/python3.12/dist-packages (from httpx<1,>=0.23.0->huggingface-hub>=0.28.1->pyannote.audio) (2026.4.22)
Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.12/dist-packages (from httpx<1,>=0.23.0->huggingface-hub>=0.28.1->pyannote.audio) (1.0.9)
Requirement already satisfied: idna in /usr/local/lib/python3.12/dist-packages (from httpx<1,>=0.23.0->huggingface-hub>=0.28.1->pyannote.audio) (3.13)
Requirement already satisfied: h11>=0.16 in /usr/local/lib/python3.12/dist-packages (from httpcore==1.*->httpx<1,>=0.23.0->huggingface-hub>=0.28.1->pyannote.audio) (0.16.0)
Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.12/dist-packages (from markdown-it-py>=2.2.0->rich>=13.9.4->pyannote.audio) (0.1.2)
Requirement already satisfied: alembic>=1.5.0 in /usr/local/lib/python3.12/dist-packages (from optuna>=4.2.0->pyannote-pipeline>=4.0.0->pyannote.audio) (1.18.4)
Requirement already satisfied: colorlog in /usr/local/lib/python3.12/dist-packages (from optuna>=4.2.0->pyannote-pipeline>=4.0.0->pyannote.audio) (6.10.1)
Requirement already satisfied: sqlalchemy>=1.4.2 in /usr/local/lib/python3.12/dist-packages (from optuna>=4.2.0->pyannote-pipeline>=4.0.0->pyannote.audio) (2.0.49)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.7->matplotlib>=3.10.0->pyannote.audio) (1.17.0)
Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests~=2.7->opentelemetry-exporter-otlp-proto-http==1.42.0->opentelemetry-exporter-otlp>=1.34.0->pyannote.audio) (3.4.7)
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests~=2.7->opentelemetry-exporter-otlp-proto-http==1.42.0->opentelemetry-exporter-otlp>=1.34.0->pyannote.audio) (2.5.0)
Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn>=1.6.1->pyannote-metrics>=4.0.0->pyannote.audio) (1.5.3)
Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn>=1.6.1->pyannote-metrics>=4.0.0->pyannote.audio) (3.6.0)
Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.12/dist-packages (from sympy>=1.13.3->torch>=2.8.0->pyannote.audio) (1.3.0)
Requirement already satisfied: primePy>=1.3 in /usr/local/lib/python3.12/dist-packages (from torch-pitch-shift>=1.2.2->torch-audiomentations>=0.12.0->pyannote.audio) (1.3)
Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.12/dist-packages (from jinja2->torch>=2.8.0->pyannote.audio) (3.0.3)
Requirement already satisfied: click>=8.2.1 in /usr/local/lib/python3.12/dist-packages (from typer->huggingface-hub>=0.28.1->pyannote.audio) (8.3.3)
Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.12/dist-packages (from typer->huggingface-hub>=0.28.1->pyannote.audio) (1.5.4)
Requirement already satisfied: annotated-doc>=0.0.2 in /usr/local/lib/python3.12/dist-packages (from typer->huggingface-hub>=0.28.1->pyannote.audio) (0.0.4)
Requirement already satisfied: aiohappyeyeballs>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2028.0,>=2022.5.0->lightning>=2.4->pyannote.audio) (2.6.1)
Requirement already satisfied: aiosignal>=1.4.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2028.0,>=2022.5.0->lightning>=2.4->pyannote.audio) (1.4.0)
Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2028.0,>=2022.5.0->lightning>=2.4->pyannote.audio) (26.1.0)
Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2028.0,>=2022.5.0->lightning>=2.4->pyannote.audio) (1.8.0)
Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2028.0,>=2022.5.0->lightning>=2.4->pyannote.audio) (6.7.1)
Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2028.0,>=2022.5.0->lightning>=2.4->pyannote.audio) (0.4.1)
Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2028.0,>=2022.5.0->lightning>=2.4->pyannote.audio) (1.23.0)
Requirement already satisfied: Mako in /usr/local/lib/python3.12/dist-packages (from alembic>=1.5.0->optuna>=4.2.0->pyannote-pipeline>=4.0.0->pyannote.audio) (1.3.11)
Requirement already satisfied: greenlet>=1 in /usr/local/lib/python3.12/dist-packages (from sqlalchemy>=1.4.2->optuna>=4.2.0->pyannote-pipeline>=4.0.0->pyannote.audio) (3.4.0)

import os
import numpy as np
import torch
import av
from math import gcd
from scipy.signal import resample_poly
import whisper
from pyannote.audio import Pipeline
from pyannote.audio.pipelines.utils.hook import ProgressHook

from google.colab import drive

# Mount Google Drive; everything below reads from / writes to DATA_DIR on it.
drive.mount('/content/drive')

DATA_DIR = "/content/drive/MyDrive/transcription_tutorial"  # adjust path as needed
VIDEO_ID = "ThN7CkeEXXk"
webm_path = f"{DATA_DIR}/{VIDEO_ID}.webm"

# Skip the network round-trip when the expected file is already on disk.
if os.path.exists(webm_path):
    print("Audio already downloaded.")
else:
    # Imported lazily: yt-dlp is only needed on the first run.
    import yt_dlp

    video_url = f"https://www.youtube.com/watch?v={VIDEO_ID}"
    # NOTE(review): the output extension is filled in by yt-dlp via %(ext)s,
    # while webm_path above assumes ".webm" — confirm they agree for this video.
    ydl_opts = {
        "format": "bestaudio",
        "outtmpl": f"{DATA_DIR}/{VIDEO_ID}.%(ext)s",
        "noplaylist": True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[youtube] Extracting URL: https://www.youtube.com/watch?v=ThN7CkeEXXk
[youtube] ThN7CkeEXXk: Downloading webpage

WARNING: [youtube] No supported JavaScript runtime could be found. Only deno is enabled by default; to use another runtime add  --js-runtimes RUNTIME[:PATH]  to your command/config. YouTube extraction without a JS runtime has been deprecated, and some formats may be missing. See  https://github.com/yt-dlp/yt-dlp/wiki/EJS  for details on installing one

[youtube] ThN7CkeEXXk: Downloading android vr player API JSON

WARNING: [youtube] No title found in player responses; falling back to title from initial data. Other metadata may also be missing
ERROR: [youtube] ThN7CkeEXXk: Sign in to confirm you’re not a bot. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies

---------------------------------------------------------------------------
ExtractorError                            Traceback (most recent call last)
/usr/local/lib/python3.12/dist-packages/yt_dlp/YoutubeDL.py in wrapper(self, *args, **kwargs)
   1697                 try:
-> 1698                     return func(self, *args, **kwargs)
   1699                 except (CookieLoadError, DownloadCancelled, LazyList.IndexError, PagedList.IndexError):

/usr/local/lib/python3.12/dist-packages/yt_dlp/YoutubeDL.py in __extract_info(self, url, ie, download, extra_info, process)
   1832         try:
-> 1833             ie_result = ie.extract(url)
   1834         except UserNotLive as e:

/usr/local/lib/python3.12/dist-packages/yt_dlp/extractor/common.py in extract(self, url)
    764                         url if self.get_param('verbose') else truncate_string(url, 100, 20)))
--> 765                     ie_result = self._real_extract(url)
    766                     if ie_result is None:

/usr/local/lib/python3.12/dist-packages/yt_dlp/extractor/youtube/_video.py in _real_extract(self, url)
   4060                     )
-> 4061                 self.raise_no_formats(reason, expected=True)
   4062 

/usr/local/lib/python3.12/dist-packages/yt_dlp/extractor/common.py in raise_no_formats(self, msg, expected, video_id)
   1276         else:
-> 1277             raise ExtractorError(msg, expected=expected, video_id=video_id)
   1278 

ExtractorError: [youtube] ThN7CkeEXXk: Sign in to confirm you’re not a bot. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies

During handling of the above exception, another exception occurred:

DownloadError                             Traceback (most recent call last)
/tmp/ipykernel_5529/3984858366.py in <cell line: 0>()
     13     }
     14     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
---> 15         ydl.download([f"https://www.youtube.com/watch?v={VIDEO_ID}"])
     16 else:
     17     print("Audio already downloaded.")

/usr/local/lib/python3.12/dist-packages/yt_dlp/YoutubeDL.py in download(self, url_list)
   3668 
   3669         for url in url_list:
-> 3670             self.__download_wrapper(self.extract_info)(
   3671                 url, force_generic_extractor=self.params.get('force_generic_extractor', False))
   3672 

/usr/local/lib/python3.12/dist-packages/yt_dlp/YoutubeDL.py in wrapper(*args, **kwargs)
   3641         def wrapper(*args, **kwargs):
   3642             try:
-> 3643                 res = func(*args, **kwargs)
   3644             except CookieLoadError:
   3645                 raise

/usr/local/lib/python3.12/dist-packages/yt_dlp/YoutubeDL.py in extract_info(self, url, download, ie_key, extra_info, process, force_generic_extractor)
   1685                     raise ExistingVideoReached
   1686                 break
-> 1687             return self.__extract_info(url, self.get_info_extractor(key), download, extra_info, process)
   1688         else:
   1689             extractors_restricted = self.params.get('allowed_extractors') not in (None, ['default'])

/usr/local/lib/python3.12/dist-packages/yt_dlp/YoutubeDL.py in wrapper(self, *args, **kwargs)
   1714                     self.report_error(msg)
   1715                 except ExtractorError as e:  # An error we somewhat expected
-> 1716                     self.report_error(str(e), e.format_traceback())
   1717                 except Exception as e:
   1718                     if self.params.get('ignoreerrors'):

/usr/local/lib/python3.12/dist-packages/yt_dlp/YoutubeDL.py in report_error(self, message, *args, **kwargs)
   1152         in red if stderr is a tty file.
   1153         """
-> 1154         self.trouble(f'{self._format_err("ERROR:", self.Styles.ERROR)} {message}', *args, **kwargs)
   1155 
   1156     def write_debug(self, message, only_once=False):

/usr/local/lib/python3.12/dist-packages/yt_dlp/YoutubeDL.py in trouble(self, message, tb, is_error)
   1091             else:
   1092                 exc_info = sys.exc_info()
-> 1093             raise DownloadError(message, exc_info)
   1094         self._download_retcode = 1
   1095 

DownloadError: ERROR: [youtube] ThN7CkeEXXk: Sign in to confirm you’re not a bot. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies

# Decode the complete first audio stream of the downloaded file into a single
# float32 tensor. The container is used as a context manager so the underlying
# file is released even when decoding raises part-way through.
with av.open(webm_path) as container:
    audio_stream = container.streams.audio[0]
    # Capture the rate while the container is still open; it is needed for the
    # summary below, after the file has been closed.
    sample_rate = audio_stream.sample_rate

    # Decode every frame and stack them along the time axis
    # NOTE(review): axis=1 assumes each frame array is (channels, samples),
    # i.e. a planar sample format — confirm for inputs other than this file.
    frames = [frame.to_ndarray() for frame in container.decode(audio_stream)]

# Fail with a readable message instead of np.concatenate's generic
# "need at least one array" when the stream produced no audio at all.
if not frames:
    raise ValueError(f"No audio frames could be decoded from {webm_path}")

waveform = torch.tensor(np.concatenate(frames, axis=1), dtype=torch.float32)

# Dict handed to the diarization pipeline further down.
audio_input = {"waveform": waveform, "sample_rate": sample_rate}

print(f"Waveform shape: {waveform.shape}  (channels × samples)")
print(f"Sample rate:    {sample_rate} Hz")
print(f"Duration:       {waveform.shape[1] / sample_rate:.1f} s")

# Build the 1-D float32 array that is passed to model.transcribe below.

# Step 1: view the decoded tensor as a NumPy array
audio_np = waveform.numpy()
print(f"Original shape: {audio_np.shape}  (channels × samples)")

# Step 2: collapse to a single channel — average when there are several,
# otherwise just take the only row
audio_np = audio_np.mean(axis=0) if audio_np.shape[0] > 1 else audio_np[0]
print(f"Mono shape:     {audio_np.shape}")

# Step 3: bring the signal to 16 kHz unless it is already there
src_rate, target_rate = audio_stream.sample_rate, 16000
if src_rate != target_rate:
    # Reduce the rate ratio by the gcd so the polyphase factors stay small.
    g = gcd(src_rate, target_rate)
    up, down = target_rate // g, src_rate // g
    audio_np = resample_poly(audio_np, up, down)
    print(f"Resampled from {src_rate} Hz \u2192 {target_rate} Hz")

audio_np = audio_np.astype(np.float32)
print(f"Final shape:    {audio_np.shape}")

# Load Whisper model and transcribe
# verbose=True prints each segment as it's decoded
# "medium" is one of the model sizes listed in the table near the end.
model = whisper.load_model("medium")

# fp16=True is faster but requires an NVIDIA GPU; use fp16=False on CPU
# language="en" pins the language instead of letting the model detect it
# (the auto-detect variant, language=None, is shown in the extras below).
# The cells that follow read result["segments"] and, per segment, the keys
# 'start', 'end', 'text', 'no_speech_prob' and 'avg_logprob'.
result = model.transcribe(audio_np, task="transcribe", language="en", fp16=False, verbose=True)

# One report line per transcript segment: start time, no-speech probability,
# average log-probability, then the stripped text.
for seg in result["segments"]:
    start = seg["start"]
    no_speech = seg["no_speech_prob"]
    logprob = seg["avg_logprob"]
    text = seg["text"].strip()
    # 'compression_ratio' and 'temperature' were commented out of the report
    # in the original cell and are intentionally still not printed.
    print(f"{start:6.1f}s | no_speech={no_speech:.2f} | logprob={logprob:.2f} | {text}")

# Show one raw segment dict (index 10, i.e. the eleventh segment) to inspect
# its keys. Raises IndexError if the transcript has fewer than 11 segments.
result["segments"][10]

# Load the pretrained diarization pipeline, authenticating with the token
# stored in the HF_TOKEN environment variable (KeyError if it is not set).
hf_token = os.environ["HF_TOKEN"]
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-community-1",
    token=hf_token,
)

# Use CUDA when PyTorch can see a GPU, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
pipeline.to(device)
print(f"Pipeline running on: {device}")

# Run the pipeline on the in-memory audio dict; the hook is passed through to
# the pipeline call (presumably to display progress while it runs).
with ProgressHook() as hook:
    output = pipeline(audio_input, hook=hook)

# Quick look at what kind of object came back.
print(f"Type of output: {type(output)}")
print(f"Attributes of output: {dir(output)}")

# List every speaker turn with its start and end time.
print("\nDiarization output:")
diarization = output.speaker_diarization
for turn, _, speaker in diarization.itertracks(yield_label=True):
    turn_start, turn_end = turn.start, turn.end
    print(f"  {speaker}  {turn_start:6.1f}s → {turn_end:6.1f}s")

def get_speaker(start, end, diarization_output):
    """Return the speaker with the most overlap with the time window [start, end].

    Args:
        start: Start of the window (same time unit as the diarization turns).
        end: End of the window.
        diarization_output: Either an object exposing a ``speaker_diarization``
            attribute, or the annotation itself. In both cases the annotation
            must provide ``itertracks(yield_label=True)`` yielding
            ``(turn, track, label)`` triples whose ``turn`` has ``start`` and
            ``end`` attributes.

    Returns:
        The label of the turn with the largest strictly positive overlap with
        the window (the earliest such turn wins ties), or the string
        "UNKNOWN" when no turn overlaps the window.
    """
    # Accept both the wrapper object and a bare annotation, so the helper keeps
    # working when the caller already holds the annotation.
    annotation = getattr(diarization_output, "speaker_diarization", diarization_output)

    max_overlap = 0.0
    best_speaker = None
    for turn, _, speaker in annotation.itertracks(yield_label=True):
        # Length of the intersection; zero or negative means the turn and the
        # window do not overlap.
        overlap = min(turn.end, end) - max(turn.start, start)
        if overlap > max_overlap:
            max_overlap, best_speaker = overlap, speaker

    return best_speaker if max_overlap > 0 else "UNKNOWN"

# Print every transcript segment prefixed with the speaker that overlaps it most
print("Labeled transcript:\n")
for seg in result["segments"]:
    start, end = seg["start"], seg["end"]
    speaker = get_speaker(start, end, output)
    text = seg["text"].strip()
    print(f"[{speaker}] {start:6.1f}s: {text}")

import pandas as pd

# Collapse consecutive transcript segments attributed to the same speaker into
# one row per speaker turn, with columns: speaker, start, end, text.
segments = result["segments"]
rows = []
current_speaker = current_start = None
current_text = []

for seg in segments:
    speaker = get_speaker(seg["start"], seg["end"], output)
    text = seg["text"].strip()

    if speaker != current_speaker:
        # Speaker changed: close the turn collected so far (if any). Its end
        # is taken to be the start of the segment that begins the new turn.
        if current_speaker is not None:
            rows.append({
                "speaker": current_speaker,
                "start": current_start,
                "end": seg["start"],
                "text": " ".join(current_text),
            })

        # Open a fresh turn for the new speaker.
        current_speaker, current_start, current_text = speaker, seg["start"], []

    current_text.append(text)

# The final turn is never closed inside the loop; it ends where the very last
# transcript segment ends.
if current_speaker is not None:
    rows.append({
        "speaker": current_speaker,
        "start": current_start,
        "end": segments[-1]["end"],
        "text": " ".join(current_text),
    })

df = pd.DataFrame(rows)
df.head(10)

# Write the merged speaker turns to DATA_DIR as CSV. index=False leaves the
# DataFrame's row index out of the file.
out_path = f"{DATA_DIR}/npr_diarized.csv"
df.to_csv(out_path, index=False)
print(f"Saved to {out_path}")

# Re-run transcription on the in-memory audio with language=None, i.e. without
# forcing a language; the language the model settled on is read back from the
# result dict.
result_auto = model.transcribe(audio_np, task="transcribe", language=None, fp16=False, verbose=False)
print(f"Detected language: {result_auto['language']}")

# Same call, but given a file path instead of an array, again without forcing
# a language.
wav_path = f"{DATA_DIR}/chunk_0.wav"
result_wav = model.transcribe(wav_path, task="transcribe", language=None, fp16=False, verbose=True)
print(f"\nDetected language: {result_wav['language']}")

# task="translate" on the same file, this time with the source language pinned
# to "es" rather than auto-detected.
# NOTE: this rebinds result_wav, replacing the result of the previous call.
wav_path = f"{DATA_DIR}/chunk_0.wav"
result_wav = model.transcribe(wav_path, task="translate", language="es", fp16=False, verbose=True)

Model	Parameters	VRAM	Speed (vs. large)
tiny	39 M	~1 GB	~10×
base	74 M	~1 GB	~7×
small	244 M	~2 GB	~4×
medium	769 M	~5 GB	~2×
large	1550 M	~10 GB	1×
turbo	809 M	~6 GB	~8×

Audio to Text Transcription Tutorial

What is Whisper?

Why do we need to do this in Python?

1. Install dependencies

2. Imports

3. Download test audio

4. Load audio into a tensor

5. Transcribe with Whisper

6. Load the pyannote diarization pipeline

7. Run speaker diarization

8. Assign speakers to transcript segments

9. Export to a DataFrame

Notes and limitations

Extra: Language Auto-Detection and Multilingual Research

Extra: Transcribe and translate a WAV file with language auto-detection

Audio to Text Transcription Tutorial

Use Cases in Computational Social Science

What is Whisper?

Why do we need to do this in Python?

1. Install dependencies

2. Imports

3. Download test audio

4. Load audio into a tensor

5. Transcribe with Whisper

6. Load the pyannote diarization pipeline

7. Run speaker diarization

8. Assign speakers to transcript segments

9. Export to a DataFrame

Notes and limitations

Extra: Language Auto-Detection and Multilingual Research

Extra: Transcribe and translate a WAV file with language auto-detection