# Source code for gnes.encoder.audio.vggish_cores.vggish_postprocess

# ==============================================================================

"""Post-process embeddings from VGGish."""

import numpy as np

from ..vggish_cores import vggish_params


class Postprocessor(object):
    """Post-processes VGGish embeddings.

    The initial release of AudioSet included 128-D VGGish embeddings for each
    segment of AudioSet. These released embeddings were produced by applying
    a PCA transformation (technically, a whitening transform is included as well)
    and 8-bit quantization to the raw embedding output from VGGish, in order to
    stay compatible with the YouTube-8M project which provides visual embeddings
    in the same format for a large set of YouTube videos. This class implements
    the same PCA (with whitening) transformation, followed by clipping and
    rescaling to the 8-bit value range.

    Note that :meth:`postprocess` returns ``float32`` values scaled to
    ``[0.0, 255.0]``; it does not round or cast them to an integer type.
    """

    def __init__(self, pca_params_npz_path):
        """Constructs a postprocessor.

        Args:
          pca_params_npz_path: Path to a NumPy-format .npz file that
            contains the PCA parameters used in postprocessing.

        Raises:
          AssertionError: If the loaded PCA matrix or means do not have the
            shapes implied by ``vggish_params.EMBEDDING_SIZE``.
        """
        # np.load on an .npz archive returns a lazily-read NpzFile that keeps
        # the underlying file open; use it as a context manager so the handle
        # is released as soon as the arrays have been read into memory.
        with np.load(pca_params_npz_path) as params:
            self._pca_matrix = params[vggish_params.PCA_EIGEN_VECTORS_NAME]
            # Load means into a column vector for easier broadcasting later.
            self._pca_means = params[vggish_params.PCA_MEANS_NAME].reshape(-1, 1)
        assert self._pca_matrix.shape == (
            vggish_params.EMBEDDING_SIZE, vggish_params.EMBEDDING_SIZE), (
                'Bad PCA matrix shape: %r' % (self._pca_matrix.shape,))
        assert self._pca_means.shape == (vggish_params.EMBEDDING_SIZE, 1), (
                'Bad PCA means shape: %r' % (self._pca_means.shape,))

    def postprocess(self, embeddings_batch):
        """Applies postprocessing to a batch of embeddings.

        Args:
          embeddings_batch: An nparray of shape [batch_size, embedding_size]
            containing output from the embedding layer of VGGish.

        Returns:
          An nparray of the same shape as the input and of type float32,
          containing the PCA-transformed input clipped to
          [QUANTIZE_MIN_VAL, QUANTIZE_MAX_VAL] and linearly rescaled to
          [0.0, 255.0]. Values are not rounded to integers.

        Raises:
          AssertionError: If ``embeddings_batch`` is not 2-d or its second
            dimension is not ``vggish_params.EMBEDDING_SIZE``.
        """
        assert len(embeddings_batch.shape) == 2, (
                'Expected 2-d batch, got %r' % (embeddings_batch.shape,))
        assert embeddings_batch.shape[1] == vggish_params.EMBEDDING_SIZE, (
                'Bad batch shape: %r' % (embeddings_batch.shape,))

        # Apply PCA.
        # - Embeddings come in as [batch_size, embedding_size].
        # - Transpose to [embedding_size, batch_size].
        # - Subtract pca_means column vector from each column.
        # - Premultiply by PCA matrix of shape [output_dims, input_dims]
        #   where both are equal to embedding_size in our case.
        # - Transpose result back to [batch_size, embedding_size].
        pca_applied = np.dot(self._pca_matrix,
                             (embeddings_batch.T - self._pca_means)).T

        # Quantize by:
        # - clipping to [min, max] range
        clipped_embeddings = np.clip(
            pca_applied, vggish_params.QUANTIZE_MIN_VAL,
            vggish_params.QUANTIZE_MAX_VAL)
        # - linearly rescaling to the 8-bit value range [0.0, 255.0]
        quantized_embeddings = (
                (clipped_embeddings - vggish_params.QUANTIZE_MIN_VAL) *
                (255.0 /
                 (vggish_params.QUANTIZE_MAX_VAL - vggish_params.QUANTIZE_MIN_VAL)))

        # The result is kept as float32 (no rounding, no integer cast).
        quantized_embeddings = quantized_embeddings.astype(np.float32)

        return quantized_embeddings