Download Audio and Subtitle Files from YouTube
June 17, 2023
You can use the `yt_dlp` library to download raw data from YouTube. Note that this requires `ffmpeg` to be installed for video and audio processing.
The `yt_dlp` library decides what to download based on an options dictionary that is passed to the downloader. In this case, we want to download the raw audio file and the raw subtitle file.
For the audio file options:
# Audio-only configuration: fetch the best audio stream, then have ffmpeg
# extract it to an mp3 file.
options = {
    "format": "bestaudio/best",
    "postprocessors": [
        {
            "key": "FFmpegExtractAudio",
            "preferredcodec": "mp3",
            "preferredquality": "192",
        },
    ],
    # Output file name template: "<video title>.<extension>".
    "outtmpl": "%(title)s.%(ext)s",
}
For the subtitle file options:
# Subtitle-only configuration: skip the media download and write the
# automatically generated subtitles instead.
options = {
    "skip_download": True,
    "writeautomaticsub": True,
    # Format preference list: ass first, then srt, then whatever is best.
    "subtitlesformat": "ass/srt/best",
    # Output file name template: "<video title>.<extension>".
    "outtmpl": "%(title)s.%(ext)s",
}
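To tie it together, create a simple helper function for the downloader and two helper functions, one for each option configuration. The snippets below assume `import pathlib`, `from typing import Optional`, and `from yt_dlp import YoutubeDL` at the top of the file: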
def download_yt_file(url: str, options: dict) -> None:
    """
    Download a YouTube file with the given options.

    Args:
        url: Address of the video to download.
        options: Options dictionary handed to ``YoutubeDL``, e.g. the result
            of ``create_audio_options`` or ``create_subtitles_options``.
    """
    # The context manager lets YoutubeDL clean up after itself when done.
    with YoutubeDL(options) as ydl:
        # download() takes a list of URLs; we only ever pass a single one.
        ydl.download([url])
def create_audio_options(
    output_directory: Optional[str] = None, output_file: Optional[str] = None
) -> dict:
    """
    Create options to download the audio file ONLY (no subtitles/video files), in mp3 format.

    Roughly equivalent CLI command:
    yt-dlp -x --audio-format mp3 --audio-quality 192K --no-check-certificates -o "%(title)s.%(ext)s" -- {URL}

    Args:
        output_directory: Existing directory to write into. Defaults to the
            current working directory when None.
        output_file: Output file name without extension. Defaults to the
            "%(title)s" template (the video title) when None.

    Returns:
        An options dictionary suitable for passing to ``YoutubeDL``.

    Raises:
        ValueError: If ``output_directory`` is given but is not an existing
            directory.
    """
    if output_directory is None:
        output_path = pathlib.Path(".")
    else:
        output_path = pathlib.Path(output_directory)
        if not output_path.is_dir():
            raise ValueError(f"Invalid directory: {output_directory}")

    if output_file is None:
        output_file = "%(title)s"

    options = {
        # NOTE(security): this turns off HTTPS certificate validation. It is
        # kept to preserve existing behavior; drop it if your certificate
        # store works.
        "nocheckcertificate": True,
        "format": "bestaudio/best",
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "mp3",
                "preferredquality": "192",
            }
        ],
        # The extension placeholder is filled in by the downloader.
        "outtmpl": str(output_path / f"{output_file}.%(ext)s"),
    }
    return options
def create_subtitles_options(
    output_directory: Optional[str] = None, output_file: Optional[str] = None
) -> dict:
    """
    Create options to download the subtitles file ONLY (no audio/video files).

    Automatically generated subtitles are requested with the format
    preference "ass/srt/best": ass first, then srt, then the best available.

    Roughly equivalent CLI command:
    yt-dlp --skip-download --write-auto-subs --sub-format ass/srt/best --no-check-certificates -o "%(title)s.%(ext)s" -- {URL}

    Args:
        output_directory: Existing directory to write into. Defaults to the
            current working directory when None.
        output_file: Output file name without extension. Defaults to the
            "%(title)s" template (the video title) when None.

    Returns:
        An options dictionary suitable for passing to ``YoutubeDL``.

    Raises:
        ValueError: If ``output_directory`` is given but is not an existing
            directory.
    """
    if output_directory is None:
        output_path = pathlib.Path(".")
    else:
        output_path = pathlib.Path(output_directory)
        if not output_path.is_dir():
            raise ValueError(f"Invalid directory: {output_directory}")

    if output_file is None:
        output_file = "%(title)s"

    options = {
        # NOTE(security): this turns off HTTPS certificate validation. It is
        # kept to preserve existing behavior; drop it if your certificate
        # store works.
        "nocheckcertificate": True,
        "skip_download": True,
        "writeautomaticsub": True,
        "subtitlesformat": "ass/srt/best",
        # The extension placeholder is filled in by the downloader.
        "outtmpl": str(output_path / f"{output_file}.%(ext)s"),
    }
    return options
We can then download the relevant data:
url = "https://www.youtube.com/watch?v=XiqN3pOIVIw"

# NOTE: the "data" directory must already exist; the option helpers raise
# ValueError for an output directory that is not an existing directory.

# download audio file
options = create_audio_options(output_directory="data")
download_yt_file(url, options)

# download subtitles file
options = create_subtitles_options(output_directory="data")
download_yt_file(url, options)