Source code for VuVoPy.data.utils.vuvs_gmm

import numpy as np
from sklearn.mixture import GaussianMixture
from scipy.stats import mode

def _frame_features(frames, sr):
    """Return one ``[E, 100 * C1, Ehl, zcr]`` feature row per row of ``frames``."""
    frame_length = frames.shape[1]

    # The frequency grid depends only on the frame length, so build it once
    # instead of once per frame.
    freqs = np.fft.rfftfreq(frame_length, 1 / sr)
    above_200_hz = freqs > 200
    split_bin = int(len(freqs) * 0.25)

    rows = []
    for frame in frames:
        power = np.abs(np.fft.rfft(frame)) ** 2

        # E: log energy of the spectrum above 200 Hz.
        log_energy = 10 * np.log10(np.sum(power[above_200_hz]) + 1e-10)

        # Ehl: log ratio of the energy above the first quarter of the
        # spectrum bins to the energy in that first quarter.
        low_energy = np.sum(power[:split_bin])
        high_energy = np.sum(power[split_bin:])
        high_low_ratio = 10 * np.log10(high_energy / (low_energy + 1e-10) + 1e-10)

        # C1: normalized autocorrelation at a lag of one sample.
        # np.correlate(frame, frame[:-1])[0] is the lag-0 term
        # (sum(frame[:-1] ** 2)), which is ~1 for every frame; the lag-1
        # term is the product of each sample with its predecessor.
        lag1_autocorr = np.sum(frame[1:] * frame[:-1]) / (np.sum(frame ** 2) + 1e-10)

        # Nz: number of sign changes between consecutive samples.
        zero_crossings = np.sum(np.diff(np.sign(frame)) != 0)

        rows.append([log_energy, 100 * lag1_autocorr, high_low_ratio, zero_crossings])

    return np.array(rows)


def _smooth_labels(labels, smoothing_window):
    """Return a copy of ``labels`` filtered with a sliding majority vote."""
    if smoothing_window <= 1:
        # A one-frame window is the identity; non-positive windows would
        # produce empty slices, so they are treated as "no smoothing".
        return labels.copy()

    half_win = smoothing_window // 2
    padded = np.pad(labels, (half_win, half_win), mode='edge')
    return np.array([
        int(mode(padded[i:i + smoothing_window], keepdims=False).mode)
        for i in range(len(labels))
    ])


def _relabel_short_runs(labels, target_class, surrounding_class, min_frames):
    """Relabel, in place, short ``target_class`` runs enclosed by ``surrounding_class``."""
    # Pad with False so that runs touching either end still get both edges.
    is_target = np.pad(labels == target_class, 1)
    edges = np.where(np.diff(is_target.astype(int)) != 0)[0]

    # Edges alternate run start / run stop (stop is exclusive).
    for start, stop in zip(edges[::2], edges[1::2]):
        if stop - start >= min_frames:
            continue
        if start == 0 or stop == len(labels):
            # A run at the boundary has no neighbour on one side.
            continue
        if labels[start - 1] == surrounding_class and labels[stop] == surrounding_class:
            labels[start:stop] = surrounding_class


def vuvs_gmm(segments, sr, winover, smoothing_window=5):
    """
    Classify audio frames as voiced, unvoiced or silence with two Gaussian
    Mixture Models, then smooth and post-process the labels.

    Parameters:
        segments (numpy.ndarray): 2D array of framed audio with one frame per
            column, i.e. shape (frame_length, num_frames). The function
            iterates over ``segments.T`` and returns one label per column.
            NOTE(review): the previous docstring described the transposed
            layout while the code iterated over columns; confirm against the
            caller.
        sr (int): Sampling rate of the audio signal in Hz. Must be positive.
        winover (int): Overlap between consecutive frames in samples. Must be
            smaller than the frame length.
        smoothing_window (int, optional): Window size, in frames, of the
            majority-vote smoothing. Values <= 1 disable smoothing.
            Defaults to 5.

    Returns:
        numpy.ndarray: Integer label per frame, where
            0 = silence, 1 = unvoiced, 2 = voiced.
            An empty array is returned when ``segments`` has no frames.

    Raises:
        ValueError: If ``segments`` is not 2D, ``sr`` is not positive, or
            ``winover`` is not smaller than the frame length.

    Notes:
        - Per-frame features: log energy above 200 Hz, lag-1 normalized
          autocorrelation (scaled by 100), high/low band log energy ratio,
          and zero-crossing count.
        - The first GMM separates voiced frames from the rest (the component
          with the higher mean energy is voiced); the second GMM splits the
          rest into unvoiced (higher mean energy) and silence. When fewer
          than two frames are non-voiced the second GMM cannot be fitted and
          those frames stay labelled as silence.
        - Post-processing: voiced runs shorter than 10 ms between unvoiced
          frames become unvoiced, unvoiced runs shorter than 10 ms between
          voiced frames become voiced, and if no voiced frame occurs within
          50 ms of the first unvoiced frame, that 50 ms span becomes silence.
          With a hop longer than 10 ms the short-run thresholds truncate to
          zero frames and the first two rules have no effect.
    """
    if segments.ndim != 2:
        raise ValueError("segments must be a 2D array of shape (frame_length, num_frames)")
    # Frames are the columns of the input, so the frame length is axis 0.
    frame_length, num_frames = segments.shape
    if sr <= 0:
        raise ValueError("sr must be a positive sampling rate")
    hop = frame_length - winover
    if hop <= 0:
        raise ValueError("winover must be smaller than the frame length")
    if num_frames == 0:
        return np.zeros(0, dtype=int)

    features = _frame_features(segments.T, sr)

    # Stage 1: voiced vs. everything else.
    gmm1 = GaussianMixture(n_components=2, covariance_type='diag', random_state=0, max_iter=100)
    gmm1.fit(features)
    voiced_idx = np.argmax(gmm1.means_[:, 0])  # Higher energy => voiced
    voiced_mask = gmm1.predict(features) == voiced_idx

    # Raw labels: 0 = silence, 1 = unvoiced, 2 = voiced
    labels = np.zeros(num_frames, dtype=int)
    labels[voiced_mask] = 2

    # Stage 2: unvoiced vs. silence among the non-voiced frames. A
    # two-component mixture needs at least two samples.
    features_us = features[~voiced_mask]
    if len(features_us) >= 2:
        gmm2 = GaussianMixture(n_components=2, covariance_type='diag', random_state=0, max_iter=100)
        gmm2.fit(features_us)
        unvoiced_idx = np.argmax(gmm2.means_[:, 0])
        unvoiced_mask = gmm2.predict(features_us) == unvoiced_idx
        labels[~voiced_mask] = np.where(unvoiced_mask, 1, 0)

    # Step 1: smooth the raw labels.
    labels = _smooth_labels(labels, smoothing_window)

    # Step 2: post-processing cleanup, with durations converted to frames.
    frame_duration = hop / sr  # seconds between consecutive frame starts
    min_duration_frames = int(0.01 / frame_duration)  # 10 ms
    # NOTE(review): earlier comments called this 90 ms, but the constant has
    # always been 0.05 s; the value is kept -- confirm the intended duration.
    long_wait_frames = int(0.05 / frame_duration)  # 50 ms

    # Rule A: short voiced runs between unvoiced -> unvoiced
    _relabel_short_runs(labels, target_class=2, surrounding_class=1,
                        min_frames=min_duration_frames)
    # Rule B: short unvoiced runs between voiced -> voiced
    _relabel_short_runs(labels, target_class=1, surrounding_class=2,
                        min_frames=min_duration_frames)

    # Rule C: if no voiced frame follows the first unvoiced frame within the
    # look-ahead window, that window becomes silence.
    # NOTE(review): only the first unvoiced frame is inspected, following the
    # original "find first UV segment" comment -- confirm this is intended.
    unvoiced_positions = np.where(labels == 1)[0]
    if len(unvoiced_positions) > 0:
        first_unvoiced = unvoiced_positions[0]
        stop = first_unvoiced + long_wait_frames
        if 2 not in labels[first_unvoiced:stop]:
            labels[first_unvoiced:stop] = 0

    return labels