Correct Audio Duration Discrepancies with Multi-Tool Validation and Transcoding | AssemblyAI

In this guide, you’ll learn how to check the audio duration of a file using three different tools: ffprobe, SoX, and MediaInfo. This guide was created in response to customer feedback about transcription results showing incorrect audio durations. The issue was traced to audio files with corrupted metadata or problematic headers, leading to inaccurate duration data. If these tools report differing durations for the same file, transcription inconsistencies can arise. We will programmatically detect any duration mismatches and transcode the file to resolve them, typically resulting in a more accurate transcription.

Quickstart

import os
import subprocess

import assemblyai as aai
3 
# Replace the placeholder with the API key from your AssemblyAI dashboard.
aai.settings.api_key = "YOUR_API_KEY"

# Configuration and client reused by transcribe() for every request.
config = aai.TranscriptionConfig(
    speech_models=["universal-3-pro", "universal-2"],
)
transcriber = aai.Transcriber()
7 
def get_duration_ffprobe(file_path):
    """Return the duration of ``file_path`` in seconds as reported by ffprobe.

    Returns None, after printing an error, when ffprobe cannot be started or
    when its output cannot be parsed as a number.
    """
    command = [
        'ffprobe', '-v', 'error', '-show_entries',
        'format=duration', '-of',
        'default=noprint_wrappers=1:nokey=1', file_path
    ]
    try:
        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    except OSError as e:
        # Raised when the ffprobe executable is missing or cannot be executed.
        print(f"Error: Unable to run ffprobe: {e}")
        return None
    try:
        return float(result.stdout.strip())
    except ValueError:
        # Empty or non-numeric output, e.g. when ffprobe could not read the file.
        print("Error: Unable to parse duration from ffprobe output.")
        return None
20 
def get_duration_sox(file_path):
    """Return the duration of ``file_path`` in seconds as reported by ``soxi -D``.

    Returns None, after printing an error, when soxi cannot be started or
    when its output cannot be parsed as a number.
    """
    command = ['soxi', '-D', file_path]
    try:
        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    except OSError as e:
        # Raised when the soxi executable is missing or cannot be executed.
        print(f"Error: Unable to run SoX: {e}")
        return None
    try:
        return float(result.stdout.strip())
    except ValueError:
        # Empty or non-numeric output, e.g. when soxi could not read the file.
        print("Error: Unable to parse duration from SoX output.")
        return None
29 
def get_duration_mediainfo(file_path):
    """Return the duration of ``file_path`` in seconds as reported by MediaInfo.

    Returns None, after printing an error, when mediainfo cannot be started
    or when its output is empty or not a number.
    """
    command = ['mediainfo', '--Output=General;%Duration%', file_path]
    try:
        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    except OSError as e:
        # Raised when the mediainfo executable is missing or cannot be executed.
        print(f"Error: Unable to run MediaInfo: {e}")
        return None

    duration_str = result.stdout.strip()
    if not duration_str:
        print("Error: MediaInfo returned empty or invalid duration")
        return None
    try:
        # The value is in milliseconds; convert to seconds to match the other tools.
        return float(duration_str) / 1000
    except ValueError:
        print("Error: Unable to parse duration from MediaInfo output.")
        return None
44 
def check_audio_durations(file_path):
    """Collect the duration of ``file_path`` from ffprobe, SoX and MediaInfo.

    Prints one line per tool and returns the three values as a tuple in the
    order (ffprobe, SoX, MediaInfo). An entry is None when the corresponding
    helper returned None.
    """
    durations = (
        get_duration_ffprobe(file_path),
        get_duration_sox(file_path),
        get_duration_mediainfo(file_path),
    )
    from_ffprobe, from_sox, from_mediainfo = durations

    # Report every reading, including the ones that failed.
    if from_ffprobe is None:
        print("ffprobe duration: Error retrieving duration")
    else:
        print(f"ffprobe duration: {from_ffprobe:.6f} seconds")

    if from_sox is None:
        print("SoX duration: Error retrieving duration")
    else:
        print(f"SoX duration: {from_sox:.6f} seconds")

    if from_mediainfo is None:
        print("MediaInfo duration: Error retrieving duration")
    else:
        print(f"MediaInfo duration: {from_mediainfo:.6f} seconds")

    return durations
58 
def transcribe(file):
    """Transcribe ``file`` with the module-level transcriber and print the text.

    When the returned transcript has an error status, the reported error is
    printed instead of the (missing) text.
    """
    print("Executing transcription as audio durations are consistent.")
    transcript = transcriber.transcribe(file, config)
    if transcript.status == aai.TranscriptStatus.error:
        print(f"Transcription failed: {transcript.error}")
        return
    print(transcript.text)
63 
def transcode(input_file, output_file):
    """Transcode ``input_file`` to a mono 16kHz WAV file at ``output_file``.

    An existing ``output_file`` is overwritten. Failures are reported with a
    printed message rather than an exception.
    """
    print(f"Transcoding file {input_file} to {output_file}...")
    # -y overwrites an existing output file. Without it ffmpeg asks for
    # confirmation, and that prompt is hidden because stderr is captured.
    command = [
        'ffmpeg', '-y', '-i', input_file, '-ar', '16000', '-ac', '1', output_file
    ]
    try:
        subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError as e:
        # Raised when the ffmpeg executable is missing or cannot be executed.
        print(f"Error: Unable to run ffmpeg: {e}")
        return
    except subprocess.CalledProcessError as e:
        # e.stderr holds ffmpeg's captured output as bytes. It is verbose, so
        # only the exit code is shown; decode and print it when troubleshooting.
        print(f"Error: Transcoding failed (ffmpeg exit code {e.returncode}).")
        return

    if os.path.exists(output_file):
        print(f"Transcoding complete. Output file: {output_file}")
    else:
        print("Error: Transcoding failed.")
80 
def durations_are_consistent(durations, tolerance=0.01):
    """Return True when all ``durations`` lie within ``tolerance`` seconds.

    ``durations`` is any iterable of numbers. The result is False when it is
    empty or contains None, because consistency cannot be confirmed then.
    """
    # Materialize once so one-shot iterators can be scanned more than once.
    durations = tuple(durations)
    if not durations or None in durations:
        return False
    return (max(durations) - min(durations)) <= tolerance
88 
def main(file_path):
    """Check ``file_path`` with all three tools and transcribe it if possible.

    When the durations agree, the file is transcribed directly. Otherwise it
    is transcoded to ``<name>_transcoded.wav`` next to the original and the
    check is repeated; the transcoded file is transcribed only if its
    durations agree.
    """
    durations = check_audio_durations(file_path)

    if durations_are_consistent(durations):
        print("The audio durations are consistent.")
        transcribe(file_path)
        return

    if None in durations:
        print("Error: One or more duration values could not be retrieved.")
    else:
        print("Warning: The audio durations differ between tools.")

    # splitext strips only a real extension, so a path such as "./audio"
    # keeps its full name instead of being cut at the leading dot.
    transcoded_file = os.path.splitext(file_path)[0] + '_transcoded.wav'
    transcode(file_path, transcoded_file)

    new_durations = check_audio_durations(transcoded_file)
    if durations_are_consistent(new_durations):
        transcribe(transcoded_file)
    else:
        print("Warning: The audio durations still differ or an error occurred with the transcoded file.")
114 
# Path to the audio file to check; replace it with your own file.
audio_file = "./audio.mp4"

if __name__ == "__main__":
    main(audio_file)

Get Started

Before we begin, make sure you have an AssemblyAI account and an API key. You can sign up for an AssemblyAI account and get your API key from your dashboard.

Step-by-Step Instructions

Install the SDK:

$ pip install assemblyai

Import the assemblyai package along with the standard-library modules the script uses, set your AssemblyAI API key, and initialize the transcriber.

import os
import subprocess

import assemblyai as aai

# Replace the placeholder with the API key from your AssemblyAI dashboard.
aai.settings.api_key = "YOUR_API_KEY"

# Configuration and client reused by transcribe() for every request.
config = aai.TranscriptionConfig(speech_models=["universal-3-pro", "universal-2"])
transcriber = aai.Transcriber()

For this cookbook you will need FFmpeg (which includes ffprobe), SoX (which provides the soxi command), and MediaInfo installed and available on your PATH. We will use these tools to pull the duration from the audio. Matching audio duration is crucial because discrepancies may indicate issues with the audio file’s metadata or headers. Such inconsistencies can lead to inaccurate transcription results, playback issues, or unexpected behaviour in media applications. By verifying that the duration is consistent across all three tools, we can detect potential problems early and correct any corrupted metadata or faulty headers before processing the audio further.

First, we will get the audio duration using ffprobe.

def get_duration_ffprobe(file_path):
    """Return the duration of ``file_path`` in seconds as reported by ffprobe.

    Returns None, after printing an error, when ffprobe cannot be started or
    when its output cannot be parsed as a number.
    """
    command = [
        'ffprobe', '-v', 'error', '-show_entries',
        'format=duration', '-of',
        'default=noprint_wrappers=1:nokey=1', file_path
    ]
    try:
        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    except OSError as e:
        # Raised when the ffprobe executable is missing or cannot be executed.
        print(f"Error: Unable to run ffprobe: {e}")
        return None
    try:
        return float(result.stdout.strip())
    except ValueError:
        # Empty or non-numeric output, e.g. when ffprobe could not read the file.
        print("Error: Unable to parse duration from ffprobe output.")
        return None

Next, we will get the audio duration for the same file using sox.

def get_duration_sox(file_path):
    """Return the duration of ``file_path`` in seconds as reported by ``soxi -D``.

    Returns None, after printing an error, when soxi cannot be started or
    when its output cannot be parsed as a number.
    """
    command = ['soxi', '-D', file_path]
    try:
        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    except OSError as e:
        # Raised when the soxi executable is missing or cannot be executed.
        print(f"Error: Unable to run SoX: {e}")
        return None
    try:
        return float(result.stdout.strip())
    except ValueError:
        # Empty or non-numeric output, e.g. when soxi could not read the file.
        print("Error: Unable to parse duration from SoX output.")
        return None

Finally, we will get the audio duration for the same file using MediaInfo.

def get_duration_mediainfo(file_path):
    """Return the duration of ``file_path`` in seconds as reported by MediaInfo.

    Returns None, after printing an error, when mediainfo cannot be started
    or when its output is empty or not a number.
    """
    command = ['mediainfo', '--Output=General;%Duration%', file_path]
    try:
        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    except OSError as e:
        # Raised when the mediainfo executable is missing or cannot be executed.
        print(f"Error: Unable to run MediaInfo: {e}")
        return None

    duration_str = result.stdout.strip()
    if not duration_str:
        print("Error: MediaInfo returned empty or invalid duration")
        return None
    try:
        # The value is in milliseconds; convert to seconds to match the other tools.
        return float(duration_str) / 1000
    except ValueError:
        print("Error: Unable to parse duration from MediaInfo output.")
        return None

The following function collects the durations reported by the three tools, prints each one, and returns them together as a tuple of values in seconds.

def check_audio_durations(file_path):
    """Collect the duration of ``file_path`` from ffprobe, SoX and MediaInfo.

    Prints one line per tool and returns the three values as a tuple in the
    order (ffprobe, SoX, MediaInfo). An entry is None when the corresponding
    helper returned None.
    """
    durations = (
        get_duration_ffprobe(file_path),
        get_duration_sox(file_path),
        get_duration_mediainfo(file_path),
    )
    from_ffprobe, from_sox, from_mediainfo = durations

    # Report every reading, including the ones that failed.
    if from_ffprobe is None:
        print("ffprobe duration: Error retrieving duration")
    else:
        print(f"ffprobe duration: {from_ffprobe:.6f} seconds")

    if from_sox is None:
        print("SoX duration: Error retrieving duration")
    else:
        print(f"SoX duration: {from_sox:.6f} seconds")

    if from_mediainfo is None:
        print("MediaInfo duration: Error retrieving duration")
    else:
        print(f"MediaInfo duration: {from_mediainfo:.6f} seconds")

    return durations

Define the transcribe function. This will run only when the duration is consistent among the three tools.

def transcribe(file):
    """Transcribe ``file`` with the module-level transcriber and print the text.

    When the returned transcript has an error status, the reported error is
    printed instead of the (missing) text.
    """
    print("Executing transcription as audio durations are consistent.")
    transcript = transcriber.transcribe(file, config)
    if transcript.status == aai.TranscriptStatus.error:
        print(f"Transcription failed: {transcript.error}")
        return
    print(transcript.text)

Define the transcode function. We will run this if one or more durations differ or cannot be retrieved. The output file will be a mono 16kHz WAV file as that is the format AssemblyAI models are trained on. When running the ffmpeg command, the transcode may fail or return warnings if there are issues with the input file’s format, corrupted metadata, or unsupported codecs. These warnings tend to be verbose, but you can print them for troubleshooting.

def transcode(input_file, output_file):
    """Transcode ``input_file`` to a mono 16kHz WAV file at ``output_file``.

    An existing ``output_file`` is overwritten. Failures are reported with a
    printed message rather than an exception.
    """
    print(f"Transcoding file {input_file} to {output_file}...")
    # -y overwrites an existing output file. Without it ffmpeg asks for
    # confirmation, and that prompt is hidden because stderr is captured.
    command = [
        'ffmpeg', '-y', '-i', input_file, '-ar', '16000', '-ac', '1', output_file
    ]
    try:
        subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError as e:
        # Raised when the ffmpeg executable is missing or cannot be executed.
        print(f"Error: Unable to run ffmpeg: {e}")
        return
    except subprocess.CalledProcessError as e:
        # e.stderr holds ffmpeg's captured output as bytes. It is verbose, so
        # only the exit code is shown; decode and print it when troubleshooting.
        print(f"Error: Transcoding failed (ffmpeg exit code {e.returncode}).")
        return

    if os.path.exists(output_file):
        print(f"Transcoding complete. Output file: {output_file}")
    else:
        print("Error: Transcoding failed.")

Define a function that will check if the durations are consistent. There may be small differences so it’s best to allow a small tolerance. In this example the tolerance value will be 0.01 seconds.

def durations_are_consistent(durations, tolerance=0.01):
    """Return True when all ``durations`` lie within ``tolerance`` seconds.

    ``durations`` is any iterable of numbers. The result is False when it is
    empty or contains None, because consistency cannot be confirmed then.
    """
    # Materialize once so one-shot iterators can be scanned more than once.
    durations = tuple(durations)
    if not durations or None in durations:
        return False
    return (max(durations) - min(durations)) <= tolerance

Finally, here is the order of operations for this program. This program will first check the duration of an audio file across different tools to ensure consistency. If any tool fails to retrieve a duration or if the durations differ, it transcodes the audio to a new 16kHz WAV file and checks the duration of the WAV file. If the durations are consistent in the transcoded file, the program proceeds to transcribe it. If inconsistencies remain after transcoding, it logs a warning to highlight the issue and will not transcribe the file.

def main(file_path):
    """Check ``file_path`` with all three tools and transcribe it if possible.

    When the durations agree, the file is transcribed directly. Otherwise it
    is transcoded to ``<name>_transcoded.wav`` next to the original and the
    check is repeated; the transcoded file is transcribed only if its
    durations agree.
    """
    durations = check_audio_durations(file_path)

    if durations_are_consistent(durations):
        print("The audio durations are consistent.")
        transcribe(file_path)
        return

    if None in durations:
        print("Error: One or more duration values could not be retrieved.")
    else:
        print("Warning: The audio durations differ between tools.")

    # splitext strips only a real extension, so a path such as "./audio"
    # keeps its full name instead of being cut at the leading dot.
    transcoded_file = os.path.splitext(file_path)[0] + '_transcoded.wav'
    transcode(file_path, transcoded_file)

    new_durations = check_audio_durations(transcoded_file)
    if durations_are_consistent(new_durations):
        transcribe(transcoded_file)
    else:
        print("Warning: The audio durations still differ or an error occurred with the transcoded file.")
26 
# Path to the audio file to check; replace it with your own file.
audio_file = "./audio.mp4"

if __name__ == "__main__":
    main(audio_file)

If you continue to experience unexpected behaviour with your file, please contact our support team at support@assemblyai.com for assistance in diagnosing the issue.