Source code for VESIcal.batchfile

import pandas as pd
import os
import sys
import warnings as w

from VESIcal import core
from VESIcal import sample_class

# Turn off chained assignment pandas warning
pd.options.mode.chained_assignment = None  # default='warn'


def rename_duplicates(df, suffix='-duplicate-'):
    """Return a copy of ``df`` whose repeated index labels are made unique.

    The first occurrence of a label keeps its (stringified) name. Every later
    occurrence gets ``suffix`` plus its occurrence number appended, e.g.
    ``'a'``, ``'a-duplicate-1'``, ``'a-duplicate-2'``.

    Parameters
    ----------
    df: pandas DataFrame
        Frame whose index may contain repeated labels.

    suffix: str
        Text inserted between the original label and the occurrence number.

    Returns
    -------
    pandas DataFrame
        ``df`` re-indexed with the de-duplicated, string-typed labels.
    """
    # 0 for the first time a label is seen, 1 for the second, and so on
    occurrence = df.groupby(level=0).cumcount().astype(str)
    # First occurrences carry no number ...
    tags = suffix + occurrence.replace('0', '')
    # ... which leaves them holding the bare suffix; blank those out entirely
    tags = tags.replace(suffix, '')
    unique_labels = df.index.astype(str) + tags
    return df.set_index(unique_labels)



[docs]
class status_bar(object):
    """Various styles of status bars that display the progress of a calculation
    within a loop
    """
    def __init__(self):
        # Nothing to initialise; the class only namespaces the printer below.
        pass

    @staticmethod
    def status_bar(percent, sample_name=None, btext=None, barLen=20):
        """
        Prints an updating status bar to the terminal or jupyter notebook.

        Declared as a static method so that it can be called either on the
        class (``status_bar.status_bar(0.5)``) or on an instance.

        Parameters
        ----------
        percent: float
            Percent value of progress from 0 to 1

        sample_name: string
            Name of the current sample being calculated

        btext: string
            Any extra text to display next to status bar

        barLen: int
            Length of bar to print
        """
        # Carriage return moves the cursor back to the start of the line so
        # that the bar is redrawn in place.
        sys.stdout.write("\r")
        sys.stdout.write("[{:<{}}] {:.0f}%".format("=" * int(barLen * percent),
                                                   barLen, percent * 100))

        # Write out sample name and trailing spaces to cover contents of
        # previous sample names left over on line
        if sample_name is not None:
            sample_string = str(sample_name)
            # Set max number of characters in sample name
            max_name_length = 25
            if len(sample_string) >= max_name_length:
                sample_string = sample_string[0:max_name_length-1] + "..."
            sys.stdout.write("  Working on sample " + sample_string +
                             "                            ")
        if btext is not None:
            sys.stdout.write(" " + str(btext))
        # Finish the line once the calculation is complete
        if percent == 1.0:
            sys.stdout.write("\n")
        sys.stdout.flush()



# ---------- BATCHFILE CLASS --------- #

[docs]
class BatchFile(object):
    """A batch file with sample names and oxide compositions

    Attributes
    ----------
        filename: str
            Path to the batch file, e.g., "my_file.xlsx". This always needs to
            be passed, even if the user is passing a pandas DataFrame rather
            than a batch file. If passing a DataFrame, filename should be set
            to None. File can be excel file (.xlsx) or .csv.

        sheet_name: str
            OPTIONAL. For Excel files. Default value is 0 which gets the first
            sheet in the batch spreadsheet file. This implements the pandas.
            read_excel() sheet_name parameter. But functionality to read in
            more than one sheet at a time (e.g.,
            pandas.read_excel(sheet_name=None)) is not yet implemented in
            VESIcal. From the pandas 1.0.4 documentation:

            Available cases:
            - Defaults to 0: 1st sheet as a DataFrame
            - 1: 2nd sheet as a DataFrame
            - "Sheet1": Load sheet with name “Sheet1”

        file_type: str
            OPTIONAL. Default is 'excel', which denotes that passed file has
            extension .xlsx. Other option is 'csv', which denotes that the
            passed file has extension .csv.

        units: str
            OPTIONAL. Default is 'wtpt_oxides'. String defining whether the
            oxide composition is given in wt percent ("wtpt_oxides", which is
            the default), mole oxides (mol_oxides) or mole cations
            (mol_cations).

        default_normalization:     str
            The type of normalization to apply to the data by default. One of:
            - 'none' (default): no normalization
            - 'standard': Normalizes an input composition to 100%.
            - 'fixedvolatiles': Normalizes major element oxides to 100 wt%,
               including volatiles. The volatile wt% will remain fixed, whilst
               the other major element oxides are reduced proportionally so
               that the total is 100 wt%.
            - 'additionalvolatiles': Normalises major element oxide wt% to
              100%, assuming it is volatile-free. If H2O or CO2 are passed to
              the function, their un-normalized values will be retained in
              addition to the normalized non-volatile oxides, summing to >100%.

        default_units     str
            The type of composition to return by default, one of:
            - wtpt_oxides (default)
            - mol_oxides
            - mol_cations

        label: str
            OPTIONAL. Default is 'Label'. Name of the column within the passed
            file referring to sample names.

        dataframe: pandas DataFrame
            OPTIONAL. Default is None in which case this argument is ignored.
            This argument is used when the user wishes to turn a pandas
            DataFrame into a BatchFile object, for example when user data is
            already in python rather than being imported from a file. In this
            case set `dataframe` equal to the dataframe object being passed in.
            If using this option, pass None to filename.
    """
    def __init__(self, filename, sheet_name=0, file_type='excel',
                 units='wtpt_oxides', label='Label',
                 default_normalization='none', default_units='wtpt_oxides',
                 dataframe=None, **kwargs):
        """Build a BatchFile from a spreadsheet file or an existing DataFrame.

        The data are read (or taken from ``dataframe``), indexed by sample
        name, sanitized (duplicate names renamed, oxide columns coerced to
        numbers, all-empty rows dropped, gaps filled with 0), total-iron
        columns folded into FeO, converted to wt% oxides when ``units`` says
        the input is molar, and finally stored on ``self.data``.

        Parameters
        ----------
        filename: str or None
            Path to an Excel (.xlsx/.xls) or .csv file. Pass None when
            supplying ``dataframe``.

        sheet_name: str or int
            Sheet to read from an Excel file.

        file_type: str
            'excel' or 'csv'. Overridden by a recognised file extension.

        units: str
            Units of the input data: 'wtpt_oxides', 'mol_oxides' or
            'mol_cations'.

        label: str or None
            Name of the column holding the sample names.

        default_normalization: str
            Stored via set_default_normalization().

        default_units: str
            Stored via set_default_units().

        dataframe: pandas DataFrame or None
            Data to use instead of reading ``filename``.

        **kwargs
            Only 'model' and 'norm' are recognised, and both merely trigger a
            warning that they are ignored.

        Raises
        ------
        core.InputError
            If ``sheet_name`` is neither str nor int, or if ``file_type`` is
            not 'excel' or 'csv' when reading from a file.
        """
        self.units = units
        self.set_default_normalization(default_normalization)
        self.set_default_units(default_units)

        # A recognised file extension takes precedence over file_type
        if filename is not None:
            _, file_extension = os.path.splitext(filename)
            if file_extension in ('.xlsx', '.xls'):
                file_type = 'excel'
            elif file_extension == '.csv':
                file_type = 'csv'

        if not isinstance(sheet_name, (str, int)):
            raise core.InputError("If sheet_name is passed, it must be of "
                                  "type str or int. Currently, VESIcal cannot "
                                  "import more than one sheet at a time.")

        # handle data if passed in as existing dataframe or as file
        if dataframe is not None:
            data = dataframe
            if label is not None:
                data = self.try_set_index(data, label)
        elif file_type == 'excel':
            data = pd.read_excel(filename, sheet_name=sheet_name)
            data = self.try_set_index(data, label)
        elif file_type == 'csv':
            data = pd.read_csv(filename)
            data = self.try_set_index(data, label)
        else:
            raise core.InputError("file_type must be one of \'excel\' or "
                                  "\'csv\'.")

        # Sanitize data inputs
        data = rename_duplicates(data)  # handle any duplicated sample names
        for column in data:  # convert all oxide columns to numeric
            if column in core.oxides:
                data[column] = data[column].apply(pd.to_numeric,
                                                  errors='coerce')
        data = data.dropna(how='all')  # drop any rows that are all NaNs
        data = data.fillna(0)  # fill in any missing data with 0's

        if 'model' in kwargs:
            w.warn("You don't need to pass a model here, so it will be "
                   "ignored. You can specify a model when performing "
                   "calculations on your dataset (e.g., "
                   "calculate_dissolved_volatiles())",
                   RuntimeWarning, stacklevel=2)

        if 'norm' in kwargs:
            w.warn("We noticed you passed a norm argument here. This does "
                   "nothing. You can normalize your BatchFile and save it to "
                   "a new variable name after import using "
                   "normalize(BatchFileObject). See the documentation for "
                   "more info.",
                   RuntimeWarning, stacklevel=2)

        total_iron_columns = ["FeOt", "FeOT", "FeOtot", "FeOtotal", "FeOstar",
                              "FeO*"]
        for name in total_iron_columns:
            if name not in data.columns:
                continue
            if 'FeO' in data.columns:
                # FeO exists: only borrow total iron for samples whose FeO
                # is empty (0) while total iron is reported.
                for sample in data.index:
                    total_iron = data.at[sample, name]
                    if data.at[sample, "FeO"] == 0 and total_iron > 0:
                        w.warn("Sample " + str(sample) + ": " +
                               str(name) + " value of " +
                               str(total_iron) +
                               " used as FeO. Fe2O3 set to 0.0.",
                               RuntimeWarning, stacklevel=2)
                        # Create the column up front; assigning a single
                        # cell of a missing column would leave NaN in every
                        # other row (the fillna above has already run).
                        if "Fe2O3" not in data.columns:
                            data["Fe2O3"] = 0.0
                        data.at[sample, "Fe2O3"] = 0.0
                        data.at[sample, "FeO"] = total_iron
            else:
                w.warn("Total iron column " + str(name) + " detected. " +
                       "This column will be treated as FeO. If Fe2O3 " +
                       "data are not given, Fe2O3 will be 0.0. In " +
                       "future, an option to calcualte FeO/Fe2O3 based " +
                       "on fO2 will be implemented.",
                       RuntimeWarning, stacklevel=2)
                data['FeO'] = data[name]

        # Internally the data are always held as wt% oxides
        if units == "mol_oxides":
            data = self._molOxides_to_wtpercentOxides(data)
        elif units == "mol_cations":
            data = self._molCations_to_wtpercentOxides(data)

        # Clip negative oxide concentrations to zero. A single .loc call is
        # used instead of chained indexing (data[col][mask] = 0), which
        # silently does nothing when pandas Copy-on-Write is active.
        for column in data:
            if column in core.oxides:
                data.loc[data[column] < 0, column] = 0

        self.data = data


[docs]
    def set_default_normalization(self, default_normalization):
        """ Set the default type of normalization to use with the
        get_composition() method.

        Parameters
        ----------
        default_normalization:    str
            The type of normalization to apply to the data. One of:
            - 'none' (no normalization)
            - 'standard' (default): Normalizes an input composition to 100%.
            - 'fixedvolatiles': Normalizes major element oxides to 100 wt%,
              including volatiles. The volatile wt% will remain fixed, whilst
              the other major element oxides are reduced proportionally so
              that the total is 100 wt%.
            - 'additionalvolatiles': Normalises major element oxide wt% to
              100%, assuming it is volatile-free. If H2O or CO2 are passed to
              the function, their un-normalized values will be retained in
              addition to the normalized non-volatile oxides, summing to >100%.

        """
        if default_normalization in ['none', 'standard', 'fixedvolatiles',
                                     'additionalvolatiles']:
            self.default_normalization = default_normalization
        else:
            raise core.InputError("The normalization method must be one of "
                                  "'none', 'standard', 'fixedvolatiles' "
                                  "or 'additionalvolatiles'.")



[docs]
    def set_default_units(self, default_units):
        """ Set the default units of composition to return when using the
        get_composition() method.

        Parameters
        ----------
        default_units     str
            The type of composition to return, one of:
            - wtpt_oxides (default)
            - mol_oxides
            - mol_cations
        """
        if default_units in ['wtpt_oxides', 'mol_oxides', 'mol_cations']:
            self.default_units = default_units
        else:
            raise core.InputError("The units must be one of 'wtpt_oxides', "
                                  "'mol_oxides','mol_cations'.")



[docs]
    def get_composition(self, species=None, normalization=None, units=None,
                        exclude_volatiles=False, asBatchFile=False):
        """ Collect the composition of every sample in the BatchFile.

        Each sample is fetched with get_sample_composition(...,
        asSampleClass=True) and its get_composition() result is gathered into
        one table with one row per sample.

        Parameters
        ----------
        species:    NoneType or str
            Forwarded to each sample's get_composition(). If NoneType
            (default) the instance defaults for normalization and units are
            substituted for any of those arguments left as None; if a species
            is given, they are forwarded exactly as passed.

        normalization:     NoneType or str
            The type of normalization to apply to the data. One of:
            - 'none' (no normalization)
            - 'standard': Normalizes an input composition to 100%.
            - 'fixedvolatiles': Normalizes major element oxides to 100 wt%,
              including volatiles. The volatile wt% will remain fixed, whilst
              the other major element oxides are reduced proportionally so
              that the total is 100 wt%.
            - 'additionalvolatiles': Normalises major element oxide wt% to
              100%, assuming it is volatile-free. If H2O or CO2 are passed to
              the function, their un-normalized values will be retained in
              addition to the normalized non-volatile oxides, summing to >100%.
            If NoneType is passed (and species is None) the default
            normalization option will be used (self.default_normalization).

        units:     NoneType or str
            The units of composition to return, one of:
            - wtpt_oxides
            - mol_oxides
            - mol_cations
            - mol_singleO
            If NoneType is passed (and species is None) the default units
            option will be used (self.default_units).

        exclude_volatiles   bool
            Forwarded to each sample's get_composition().

        asBatchFile:    bool
            If True, returns a BatchFile object. If False, returns a
            pandas.DataFrame object.

        Returns
        -------
        pandas.DataFrame or BatchFile object
            One row per sample. The frame is None when the per-sample result
            is neither a pandas Series nor a float.
        """
        data = self.data.copy()

        # Instance defaults only apply when the whole composition is wanted
        if species is None:
            if normalization is None:
                normalization = self.default_normalization
            if units is None:
                units = self.default_units

        sample_names = []
        new_compositions = []
        for sample_name in data.index:
            as_sample = self.get_sample_composition(sample_name, units=units,
                                                    asSampleClass=True)
            converted = as_sample.get_composition(
                species=species, normalization=normalization, units=units,
                exclude_volatiles=exclude_volatiles)
            new_compositions.append(converted)
            sample_names.append(sample_name)

        # The type of the first result decides how the table is assembled
        first_result = new_compositions[0]
        if isinstance(first_result, pd.Series):
            # One column per sample, then flip so that samples become rows
            columns = [pd.DataFrame(comp) for comp in new_compositions]
            return_frame = pd.concat(columns, axis=1).transpose()
            return_frame["new_index"] = sample_names
            return_frame = return_frame.set_index("new_index")
            return_frame.index.name = None
        elif isinstance(first_result, float):
            # Single species requested: one column named after that species
            return_frame = pd.DataFrame({species: new_compositions},
                                        index=list(sample_names))
        else:
            return_frame = None

        if asBatchFile is False:
            return return_frame
        return BatchFile(filename=None, dataframe=return_frame, label=None)



[docs]
    def get_data(self, normalization=None, units=None, asBatchFile=False):
        """
        Returns everything stored in a BatchFile object: the compositional
        data (via get_composition()) side by side with all columns that are
        not oxides. To return only the compositional data, use
        get_composition().

        Parameters
        ----------
        normalization:     NoneType or str
            The type of normalization to apply to the data. One of:
            - 'none' (no normalization)
            - 'standard': Normalizes an input composition to 100%.
            - 'fixedvolatiles': Normalizes major element oxides to 100 wt%,
              including volatiles. The volatile wt% will remain fixed, whilst
              the other major element oxides are reduced proportionally so
              that the total is 100 wt%.
            - 'additionalvolatiles': Normalises major element oxide wt% to
              100%, assuming it is volatile-free. If H2O or CO2 are passed to
              the function, their un-normalized values will be retained in
              addition to the normalized non-volatile oxides, summing to >100%.

            If NoneType is passed the default normalization option will be
            used (self.default_normalization).

        units:     NoneType or str
            The units of composition to return, one of:
            - wtpt_oxides
            - mol_oxides
            - mol_cations
            - mol_singleO

            If NoneType is passed the default units option will be used
            (self.default_units).

        asBatchFile:    bool
            If True, returns a BatchFile object. If False, returns a
            pandas.DataFrame object.

        Returns
        -------
        pandas.DataFrame or BatchFile object
            All sample information.
        """
        data = self.data.copy()

        # Fall back on the instance defaults for anything left unspecified
        units = self.default_units if units is None else units
        normalization = (self.default_normalization if normalization is None
                         else normalization)

        # Compositional part, converted/normalized as requested
        compositional_data = self.get_composition(normalization=normalization,
                                                  units=units)

        # Everything that is not an oxide column is passed through untouched
        other_columns = [col for col in data.columns
                         if col not in core.oxides]
        non_compositional_data = data.filter(other_columns)

        # Stitch both parts together column-wise
        return_frame = pd.concat([compositional_data, non_compositional_data],
                                 axis=1)

        if asBatchFile is False:
            return return_frame
        return BatchFile(filename=None, dataframe=return_frame, label=None)



[docs]
    def get_sample_composition(self, samplename, species=None,
                               normalization=None, units=None,
                               asSampleClass=False):
        """
        Returns the oxide composition of a single sample from the stored data.

        The oxide columns of the sample's row are wrapped in a
        sample_class.Sample, whose get_composition() is then called with the
        requested species, units and normalization.

        Parameters
        ----------
        samplename: string
            Name of the desired sample

        species: NoneType or str
            Forwarded to Sample.get_composition(). If NoneType (default) the
            instance defaults are substituted for any normalization/units
            left as None.

        normalization: NoneType or str
            The type of normalization to apply to the data. One of:
            - 'none' (no normalization)
            - 'standard': Normalizes an input composition to 100%.
            - 'fixedvolatiles': Normalizes major element oxides to 100 wt%,
              including volatiles. The volatile wt% will remain fixed, whilst
              the other major element oxides are reduced proportionally so
              that the total is 100 wt%.
            - 'additionalvolatiles': Normalises major element oxide wt% to
              100%, assuming it is volatile-free. If H2O or CO2 are passed to
              the function, their un-normalized values will be retained in
              addition to the normalized non-volatile oxides, summing to >100%.

            If NoneType is passed (and species is None) the default
            normalization option will be used (self.default_normalization).

        units:     NoneType or str
            The units of composition to return, one of:
            - wtpt_oxides
            - mol_oxides
            - mol_cations
            - mol_singleO
            If NoneType is passed (and species is None) the default units
            option will be used (self.default_units).

        asSampleClass:  bool
            If True, the composition obtained is wrapped in a new
            sample_class.Sample (built with that class's default options)
            and returned.

        Returns
        -------
        dictionary, float, or sample_class.Sample object
            Composition of the sample as oxides. None is returned when
            asSampleClass is False and species is neither None nor a str.

        Raises
        ------
        core.InputError
            If ``normalization`` is not None or one of the listed options.
        """
        # Instance defaults only apply when the whole composition is wanted
        if species is None:
            if normalization is None:
                normalization = self.default_normalization
            if units is None:
                units = self.default_units

        # Check that normalization being chosen is one of the possible options
        allowed = [None, 'none', 'standard', 'fixedvolatiles',
                   'additionalvolatiles']
        if normalization not in allowed:
            raise core.InputError("The normalization method must be one of "
                                  "'none', 'standard', 'fixedvolatiles', "
                                  "or 'additionalvolatiles'.")

        # Pull the sample's row and keep only the oxide entries
        sample_row = pd.DataFrame(self.data.loc[samplename])
        row_values = sample_row.to_dict()[samplename]
        sample_oxides = {item: value for item, value in row_values.items()
                         if item in core.oxides}

        _sample = sample_class.Sample(sample_oxides)

        # Get sample composition in terms of any species, units, and
        # normalization passed
        return_sample = _sample.get_composition(species=species, units=units,
                                                normalization=normalization)

        if asSampleClass:
            return sample_class.Sample(return_sample)
        if species is None:
            return dict(return_sample)
        if isinstance(species, str):
            return return_sample
        return None


    def _molOxides_to_wtpercentOxides(self, data):
        """Convert every row of ``data`` from mole oxides to the composition
        returned by Sample.get_composition() with its default arguments.

        Parameters
        ----------
        data: pandas DataFrame
            One row per sample. Oxide columns that are absent are treated as
            0.0. The frame is modified in place and also returned.

        Returns
        -------
        pandas DataFrame
            ``data`` with every oxide in core.oxides overwritten by its
            converted value.
        """
        # Decide once which oxides the input actually provides: the loop
        # below adds the missing oxide columns to ``data``, while the rows
        # yielded by iterrows() still reflect the original columns.
        provided = [oxide for oxide in core.oxides if oxide in data.columns]
        for i, row in data.iterrows():
            sample_comp = {oxide: (row[oxide] if oxide in provided else 0.0)
                           for oxide in core.oxides}
            _sample = sample_class.Sample(sample_comp, units='mol_oxides')
            _sample_conv = _sample.get_composition()
            # Write back every oxide, each under its own name
            # NOTE(review): assumes the converted composition has an entry
            # for every oxide in core.oxides, as the cation variant does.
            for oxide in core.oxides:
                data.loc[i, oxide] = _sample_conv[oxide]
        return data

    def _molCations_to_wtpercentOxides(self, data):
        """Convert every row of ``data`` from mole cations to the composition
        returned by Sample.get_composition() with its default arguments.

        Parameters
        ----------
        data: pandas DataFrame
            One row per sample, with cation-named columns. Cation columns
            that are absent are treated as 0.0. The frame is modified in
            place and also returned.

        Returns
        -------
        pandas DataFrame
            ``data`` with a column for every oxide in core.oxides holding the
            converted value.
        """
        # Look the cation name up one oxide at a time; indexing the mapping
        # with the whole oxide collection at once is not a valid lookup.
        # NOTE(review): assumes core.oxides_to_cations maps each oxide name
        # in core.oxides to its cation name - confirm against core.
        cations = [core.oxides_to_cations[oxide] for oxide in core.oxides]
        for i, row in data.iterrows():
            sample_comp = {}
            for cation in cations:
                if cation in data.columns:
                    sample_comp[cation] = row[cation]
                else:
                    sample_comp[cation] = 0.0
            _sample = sample_class.Sample(sample_comp, units='mol_cations')
            _sample_conv = _sample.get_composition()
            for oxide in core.oxides:
                data.loc[i, oxide] = _sample_conv[oxide]
        return data


[docs]
    def try_set_index(self, dataframe, label):
        """
        Method to handle setting the index column in an BatchFile object. If
        no column is passed that matches the default index name, then this
        method will attempt to choose the 'best' column that the user might
        want to serve as an index column.

        Parameters
        ----------
        dataframe: pandas DataFrame

        label: str
            Name of the column within the passed Excel file referring to
            sample names.
        """
        _dataframe = dataframe.copy()
        try:
            _dataframe = _dataframe.set_index(label)
        except Exception:
            label_found = False
            for col in _dataframe.columns:
                if col in core.oxides:
                    pass
                else:
                    _dataframe = _dataframe.set_index(col)
                    label_found = True
                    w.warn("No Label column given, so column '" + str(col) +
                           "' was chosen for you. To choose your own, set " +
                           "label='<column-name>'.", RuntimeWarning,
                           stacklevel=2)
                    break
            if label_found is False:
                _dataframe.index.name = 'Label'
                w.warn("No Label column given, so one was created for you. "
                       "To choose your own, set label='<column-name>'.",
                       RuntimeWarning, stacklevel=2)

        return _dataframe



[docs]
    def save_excel(self, filename, calculations, sheet_names=None):
        """
        Saves data calculated by the user in batch processing mode (using the
        BatchFile class methods) to an organized Excel file, with the original
        user data plus any calculated data.

        All arguments are validated before the file is opened, so invalid
        input does not leave a partially written workbook behind.

        Parameters
        ----------
        filename: string
            Name of the file. Extension (.xlsx) should be passed along with
            the name itself, all in quotes (e.g., 'myfile.xlsx').

        calculations: pandas DataFrame or list of pandas DataFrames
            A single DataFrame or list of DataFrames (e.g., calculated outputs
            from any of the core BatchFile functions:
            calculate_dissolved_volatiles, calculate_equilibrium_fluid_comp,
            and calculate_saturation_pressure). If None, only the original
            user data will be saved.

        sheet_names: None, string, or list
            OPTIONAL. Default value is None, in which case the sheets are
            named 'Calc0', 'Calc1', ... Allows user to set the name of the
            sheet or sheets written to the Excel file. Ignored when
            calculations is None.

        Returns
        -------
            Creates and saves an Excel file with the original data on sheet
            'Original_User_Data' and each calculation on its own sheet.

        Raises
        ------
        core.InputError
            If calculations is a list but sheet_names is neither a list nor
            None, if the two differ in length, or if a sheet name is not a
            string.
        """
        # Normalise calculations into a list of frames
        if calculations is None:
            frames = []
        elif isinstance(calculations, list):
            if not (isinstance(sheet_names, list) or sheet_names is None):
                raise core.InputError("If calculations is passed as list, "
                                      "sheet_names must also be list of same "
                                      "length")
            frames = calculations
        else:
            frames = [calculations]

        # Work out one sheet name per frame
        if calculations is None:
            names = []
        elif sheet_names is None:
            names = ['Calc%s' % n for n in range(len(frames))]
        else:
            names = (sheet_names if isinstance(sheet_names, list)
                     else [sheet_names])
            if len(names) != len(frames):
                raise core.InputError("calculations and sheet_names "
                                      "must have the same length")
            if not all(isinstance(name, str) for name in names):
                raise core.InputError("if sheet_names is passed, "
                                      "it must be list of strings")

        with pd.ExcelWriter(filename) as writer:
            # sheet_name is passed by keyword: pandas has deprecated passing
            # it to to_excel() positionally.
            self.data.to_excel(writer, sheet_name='Original_User_Data')
            for frame, name in zip(frames, names):
                frame.to_excel(writer, sheet_name=name)
        print("Saved " + str(filename))



[docs]
    def save_csv(self, filenames, calculations, **kwargs):
        """
        Saves data calculated by the user in batch processing mode to a
        comma-separated values (csv) file. Mirros the pandas.to_csv() method.
        Any argument that can be passed to pandas.csv() can be passed here.
        One csv file will be saved for each calculation passed.

        Parameters
        ----------
        filenames: string or list of strings
            Name of the file. Extension (.csv) should be passed along with
            the name itself, all in quotes (e.g., 'myfile.csv'). The number
            of calculations passed must match the number of filenames passed.
            If passing more than one, should be passed as a list.

        calculations: pandas DataFrame or list of pandas DataFrames
            A single variable or list of variables containing calculated
            outputs from any of the core BatchFile functions:
            calculate_dissolved_volatiles, calculate_equilibrium_fluid_comp,
            and calculate_saturation_pressure.

        Returns
        -------
            Creates and saves a CSV file or files with data from each
            calculation saved to its own file.
        """
        if isinstance(filenames, list) is False:
            filenames = [filenames]
        if isinstance(calculations, list) is False:
            calculations = [calculations]
        if len(filenames) != len(calculations):
            raise core.InputError("calculations and filenames must have the "
                                  "same length")

        for i in range(len(filenames)):
            calculations[i].to_csv(filenames[i], **kwargs)
            print("Saved " + str(filenames[i]))




def from_DataFrame(dataframe, units='wtpt_oxides', label='Label'):
    """
    Wraps a pandas DataFrame in a VESIcal BatchFile object, without reading
    anything from disk.

    Parameters
    ----------
    dataframe: pd.DataFrame object
        DataFrame object containing samples and oxide compositions.

    units: str
        OPTIONAL. Default is 'wtpt_oxides'. String defining whether the oxide
        composition is given in wt percent ("wtpt_oxides", which is the
        default), mole fraction oxides ("mol_oxides"), or mole fraction
        cations ("mol_cations").

    label: str
        OPTIONAL. Default is 'Label'. Name of the column within the passed
        DataFrame referring to sample names. This column will be set as the
        index column.

    Returns
    -------
    VESIcal.BatchFile object
    """
    # No file is involved, so the filename slot is explicitly None
    batch = BatchFile(None, units=units, label=label, dataframe=dataframe)
    return batch