Source code for VuVoPy.data.containers.voiced_sample

import numpy as np
import matplotlib.pyplot as plt
from VuVoPy.data.containers.prepocessing import Preprocessed as pp
from VuVoPy.data.containers.sample import VoiceSample as vs
from VuVoPy.data.containers.segmentation import Segmented as sg
from VuVoPy.data.utils.vuvs_detection import Vuvs as vuvs


[docs]
class VoicedSample(vs):
    """
    VoicedSample is a class that processes and analyzes preprocessed audio data to extract voiced samples 
    and remove silence from the waveform. It also provides functionality to stretch labels to match the 
    signal length.
    Attributes:
        x (numpy.ndarray): The original waveform extracted from the preprocessed data.
        x_preem (numpy.ndarray): The pre-emphasized version of the waveform.
        x_norm (numpy.ndarray): The normalized version of the waveform.
        fs (int): The sampling rate of the audio signal.
        vuvs (object): An object containing voiced/unvoiced labels for the audio signal.
        voiced_sample (numpy.ndarray): The waveform containing only voiced segments.
        silence_removed_sample (numpy.ndarray): The waveform with silence removed.
    Methods:
        get_waveform():
            Returns the silence-removed waveform as a NumPy array.
        label_stretch():
            Stretches the voiced/unvoiced labels to match the length of the audio signal.
        get_voiced_sample():
            Extracts and returns the voiced segments of the waveform based on the stretched labels.
        get_silence_remove_sample():
            Removes silence from the waveform based on the stretched labels and returns the resulting waveform.
        get_sampling_rate():
            Returns the sampling rate of the audio signal.
    """
    def __init__(self, preprocessed, vuvs, fs) :
        self.x = preprocessed.get_waveform()
        self.x_preem = preprocessed.get_preemphasis()
        self.x_norm = preprocessed.get_normalization()
        self.fs = preprocessed.get_sampling_rate()
        self.vuvs = vuvs

        self.voiced_sample = self.get_voiced_sample()
        self.silence_removed_sample = self.get_silence_remove_sample()


[docs]
    def get_waveform(self):
        """
        Return the silence removed waveform as a NumPy array.
        """
        return self.voiced_sample



[docs]
    def label_stretch(self):
        """
        Stretches or compresses a sequence of labels to match the length of a target array.
        This function takes a sequence of labels and adjusts their lengths proportionally 
        to match the length of the target array `self.x`. It ensures that the relative 
        proportions of the original label segments are preserved while fixing any rounding 
        errors to exactly match the target length.
        Returns:
            np.ndarray: A stretched or compressed array of labels with the same length as `self.x`.
        """

        labels = self.vuvs.get_vuvs()
        arr = np.asarray(labels)
        target_len = len(self.x)
        # Find segments where values stay the same
        segments = []
        start_idx = 0
        for i in range(1, len(arr)):
           if arr[i] != arr[i - 1]:
              segments.append(arr[start_idx:i])
              start_idx = i
        segments.append(arr[start_idx:])  # Add last segment
        original_lens = np.array([len(seg) for seg in segments])

        # Determine how many samples per segment
        total_original = np.sum(original_lens)
    
        # Calculate how much to stretch each segment
        stretched_lens = np.round((original_lens / total_original) * target_len).astype(int)

        # Fix rounding errors to exactly match target_len
        diff = target_len - np.sum(stretched_lens)
        while diff != 0:
            for i in range(len(stretched_lens)):
               if diff == 0:
                  break
            stretched_lens[i] += 1 if diff > 0 else -1
            diff = target_len - np.sum(stretched_lens)

        # Build the stretched array
        stretched = np.concatenate([np.full(l, seg[0]) for seg, l in zip(segments, stretched_lens)])
        return stretched

    

[docs]
    def get_voiced_sample(self):
        """
        Extracts and returns the voiced portion of the audio sample.
        This method uses the label information to identify the voiced segments
        in the audio sample. It assumes that the labels are generated such that
        a label value of 2 corresponds to voiced segments.
        Returns:
            numpy.ndarray: A subset of the audio sample containing only the 
            voiced segments.
        """
        
        sample = self.x
        labels = self.label_stretch()
        voiced_sample = sample[labels == 2]
        return voiced_sample



[docs]
    def get_silence_remove_sample(self):
        """
        Removes segments of silence from the audio sample based on the provided labels.
        This method identifies silent regions in the audio sample `self.x` using the 
        labels generated by the `label_stretch` method. Silent regions are defined as 
        consecutive frames labeled as 0, with a duration greater than or equal to 50 ms. 
        These regions are then removed from the audio sample.
        Returns:
            numpy.ndarray: A modified version of the audio sample `self.x` with silent 
            regions removed.
        """
        
        sample = self.x
        labels = self.label_stretch()
        
        i = 0
        min_frames = int(np.ceil(50 / 1000 * self.fs))
        silence_idx = []

        while i < len(labels):
            if labels[i] == 0:
                start = i
                while i < len(labels) and labels[i] == 0:
                    i += 1
                silence_len = i - start
                if silence_len >= min_frames:
                    silence_idx.append((start, i))
            else:
                i += 1
            mask = np.ones(len(self.x), dtype=bool)
            for start, end in silence_idx:
                mask[start:end] = False
        
        return self.x[mask]


[docs]
    def get_sampling_rate(self):
        """
        Return the sampling rate.
        """
        return self.fs