Source code for Protomix.normalize

import pandas as pd
import numpy as np

def normalize(spectra_df: pd.DataFrame, method: str = 'PQN') -> pd.DataFrame:
    """
    Apply a normalization method to a DataFrame of spectra.

    Each row of ``spectra_df`` is one sample (spectrum) and each column is one
    data point of the spectrum. The input DataFrame is not modified; a new
    DataFrame with the same index and columns is returned.

    Supported methods:

    * ``'PQN'`` (Probabilistic Quotient Normalization): every spectrum is
      divided by the median of its point-wise quotients against the median
      (reference) spectrum.
    * ``'TotalArea'`` (Total Area Normalization): every spectrum is divided by
      the sum of its data points.
    * ``'SNV'`` (Standard Normal Variate): every spectrum is mean-centered and
      divided by its own standard deviation.

    In all three methods a per-spectrum divisor that is exactly zero is
    replaced by 1, so that a flat or empty spectrum is passed through
    unscaled instead of being turned into NaN/inf values.

    :param spectra_df: The DataFrame containing the spectra, with samples as
        rows and data points as columns.
    :type spectra_df: pd.DataFrame
    :param method: The normalization method to apply: 'PQN', 'TotalArea' or
        'SNV'. Default is 'PQN'.
    :type method: str
    :return: The normalized spectra as a DataFrame.
    :rtype: pd.DataFrame
    :raises ValueError: If ``method`` is not one of the supported names.
    """
    if method == 'PQN':
        # Step 1: the reference spectrum is the column-wise median over all
        # samples.
        reference_spectrum = spectra_df.median(axis=0)

        # Step 2: point-wise quotients of every spectrum against the reference.
        quotients = spectra_df.divide(reference_spectrum, axis=1)

        # Step 3: the median quotient of each spectrum is its scaling factor.
        scaling_factors = quotients.median(axis=1)

        # A zero scaling factor would cause a division by zero below.
        scaling_factors = scaling_factors.replace(0, 1)

        # Step 4: scale every spectrum by its own factor.
        normalized_spectra = spectra_df.divide(scaling_factors, axis=0)

    elif method == 'TotalArea':
        # Sum of all data points of each spectrum.
        total_area = spectra_df.sum(axis=1)

        # Same zero guard as PQN: a spectrum whose points sum to zero is left
        # unscaled rather than divided by zero.
        total_area = total_area.replace(0, 1)

        normalized_spectra = spectra_df.divide(total_area, axis=0)

    elif method == 'SNV':
        # Center each spectrum on its own mean.
        centered_spectra = spectra_df.subtract(spectra_df.mean(axis=1), axis=0)

        # Row-wise standard deviation (pandas default, ddof=1).
        std_dev = centered_spectra.std(axis=1)

        # A constant spectrum has zero deviation; its centered values are all
        # zero already, so dividing by 1 keeps them at 0 instead of NaN.
        std_dev = std_dev.replace(0, 1)

        normalized_spectra = centered_spectra.divide(std_dev, axis=0)

    else:
        raise ValueError(
            "Invalid normalization method. Choose from 'PQN', 'TotalArea', 'SNV'."
        )

    return normalized_spectra